
Commit 7dc55a2

Added optimized nf4 quantization
1 parent dd94ac5 commit 7dc55a2

17 files changed: +365 -156 lines changed


nncf/openvino/optimized_functions/__init__.py

+1
@@ -10,6 +10,7 @@
 # limitations under the License.
 
 from nncf.openvino.optimized_functions.functions import astype as astype
+from nncf.openvino.optimized_functions.functions import do_float_quantization as do_float_quantization
 from nncf.openvino.optimized_functions.functions import do_integer_quantization as do_integer_quantization
 from nncf.openvino.optimized_functions.functions import get_integer_quantization_error as get_integer_quantization_error
 from nncf.openvino.optimized_functions.functions import (

nncf/openvino/optimized_functions/functions.py

+44
@@ -11,10 +11,12 @@
 
 from typing import Optional, Tuple, Union
 
+from nncf import CompressWeightsMode
 from nncf.common.utils.caching import disable_results_caching
 from nncf.openvino.optimized_functions.models import OV_MODEL_CACHE
 from nncf.openvino.optimized_functions.models import OVModelParameters
 from nncf.openvino.optimized_functions.models import get_astype_model
+from nncf.openvino.optimized_functions.models import get_float_quantization_model
 from nncf.openvino.optimized_functions.models import get_integer_quantization_error_model
 from nncf.openvino.optimized_functions.models import get_integer_quantization_model
 from nncf.openvino.optimized_functions.models import get_integer_quantize_dequantize_weight_model
@@ -97,6 +99,48 @@ def do_integer_quantization(
     return compressed_weight, scale, zero_point
 
 
+def do_float_quantization(
+    weight: Tensor,
+    config: WeightCompressionConfig,
+    reduction_axes: Optional[ReductionAxes] = None,
+    precomputed_scale: Tensor = None,
+) -> Tuple[Tensor, Tensor]:
+    weight_shape = weight.shape
+    scale_shape = None if precomputed_scale is None else precomputed_scale.shape
+
+    ov_model_params = OVModelParameters()
+    ov_model_params.input_dtypes["weight"] = weight.dtype
+    if precomputed_scale is not None:
+        ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype
+    if config.num_bits == 4 and weight.backend == TensorBackend.ov:
+        # Return ov tensors in target precision to seamlessly insert them into openvino model later
+        ov_model_params.return_ov_tensors = True
+        dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1
+        ov_model_params.output_dtypes.update({"compressed_weight": dtype})
+
+    model = get_float_quantization_model(
+        ov_model_params,
+        config,
+        weight_shape,
+        scale_shape,
+        reduction_axes,
+    )
+
+    if precomputed_scale is None:
+        # weight -> compressed_weight, scale
+        compressed_weight, scale = model([weight])
+
+        # Scale is always in fp32 so there is no need to store it in ov.Tensor
+        if scale.backend == TensorBackend.ov:
+            scale = scale.as_numpy_tensor()
+    else:
+        # weight, scale -> compressed_weight
+        compressed_weight = model([weight, precomputed_scale])[0]
+        scale = precomputed_scale
+
+    return compressed_weight, scale
+
+
 def integer_quantize_dequantize_weight(
     weight: Tensor,
     config: WeightCompressionConfig,
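
Note: for context, a minimal usage sketch of the new optimized entry point added above. This is not part of the commit; the toy weight, the NF4 config, and the reduction axis are illustrative assumptions, and OpenVINO is required because the call compiles a small OV model on CPU.

import numpy as np

from nncf import CompressWeightsMode
from nncf.openvino.optimized_functions import do_float_quantization
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import Tensor

# Hypothetical example: per-row NF4 quantization of a random fp32 weight.
config = WeightCompressionConfig(mode=CompressWeightsMode.NF4)
weight = Tensor(np.random.rand(128, 256).astype(np.float32))

# No precomputed_scale is passed, so the compiled model returns both the compressed weight and the scale.
compressed_weight, scale = do_float_quantization(weight, config, reduction_axes=(1,))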

nncf/openvino/optimized_functions/models.py

+112
@@ -22,6 +22,7 @@
 from openvino.runtime import Node
 from openvino.runtime import opset13 as opset
 
+from nncf import CompressWeightsMode
 from nncf.common.utils.backend import is_openvino_at_least
 from nncf.common.utils.caching import ResultsCache
 from nncf.common.utils.caching import cache_results
@@ -233,6 +234,26 @@ def get_integer_quantization_model(
     )
 
 
+def get_float_quantization_model(
+    ov_model_params: OVModelParameters,
+    config: WeightCompressionConfig,
+    weight_shape: Tuple,
+    scale_shape: Optional[Tuple] = None,
+    reduction_axes: Optional[ReductionAxes] = None,
+) -> Union[ModelCallable, ModelAsNodes]:
+    weight_shape, scale_shape, _ = _prepare_quantization_model_inputs(
+        ov_model_params, weight_shape, scale_shape, zero_point_shape=None, reduction_axes=reduction_axes
+    )
+
+    return _build_float_quantization_model(
+        config,
+        ov_model_params,
+        weight_shape,
+        scale_shape,
+        reduction_axes,
+    )
+
+
 def get_integer_quantize_dequantize_weight_model(
     ov_model_params: OVModelParameters,
     config: WeightCompressionConfig,
@@ -453,6 +474,97 @@ def _build_integer_quantization_model(
     return partial(_infer_ov_model, ov_model_params, compiled_model)
 
 
+@cache_results(OV_MODEL_CACHE)
+def _build_float_quantization_model(
+    config: WeightCompressionConfig,
+    ov_model_params: OVModelParameters,
+    weight_shape: Tuple,
+    scale_shape: Optional[Tuple] = None,
+    reduction_axes: Optional[ReductionAxes] = None,
+    return_nodes: bool = False,
+) -> Union[ModelCallable, ModelAsNodes]:
+    default_input_dtypes = {"scale": TensorDataType.float32}
+    default_output_dtypes = {"compressed_weight": TensorDataType.float32, "scale": TensorDataType.float32}
+
+    # Update input and output dtypes with the default values
+    ov_model_params = copy.deepcopy(ov_model_params)
+    ov_model_params.input_dtypes = {**default_input_dtypes, **ov_model_params.input_dtypes}
+    ov_model_params.output_dtypes = {**default_output_dtypes, **ov_model_params.output_dtypes}
+
+    if "weight" not in ov_model_params.input_dtypes:
+        msg = "Input weight dtype is required!"
+        raise ValueError(msg)
+
+    weight_dtype = ov_model_params.input_dtypes["weight"]
+    input_scale_dtype = ov_model_params.input_dtypes["scale"]
+    compressed_weight_dtype = ov_model_params.output_dtypes["compressed_weight"]
+    output_scale_dtype = ov_model_params.output_dtypes["scale"]
+
+    # Validate input dtypes
+    valid_weight_dtypes = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]
+    if weight_dtype not in valid_weight_dtypes:
+        msg = f"Weight must be one of the following data types: {valid_weight_dtypes}. But found: {weight_dtype}."
+        raise ValueError(msg)
+    if scale_shape is not None and input_scale_dtype != TensorDataType.float32:
+        msg = f"Input scale must be of float32 data type. But found: {input_scale_dtype}."
+        raise ValueError(msg)
+
+    # Validate output dtypes
+    # TODO: Enable f4e2m1
+    valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4]
+    if compressed_weight_dtype not in valid_compressed_weight_dtypes:
+        msg = (
+            f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. "
+            f"But found: {compressed_weight_dtype}."
+        )
+        raise ValueError(msg)
+    if scale_shape is None and output_scale_dtype != TensorDataType.float32:
+        msg = f"Output scale must be of float32 data type. But found: {output_scale_dtype}."
+        raise ValueError(msg)
+
+    # Build OV model
+    weight = opset.parameter(weight_shape, name="weight", dtype=DTYPE_MAP_OV[weight_dtype])
+    ov_parameters = [weight]
+    weight = convert_op(weight, ov.Type.f32)
+
+    divide_op = opset.divide if ov_model_params.convertable_division else non_convertable_divide_op
+    if scale_shape is not None:
+        # Scale is given as an input
+        scale = opset.parameter(scale_shape, name="scale", dtype=DTYPE_MAP_OV[input_scale_dtype])
+        ov_parameters.append(scale)
+    else:
+        # Compute scale
+        scale = opset.reduce_max(opset.abs(weight), reduction_axes=reduction_axes, keep_dims=True)
+        # NOTE: adding machine epsilon to avoid division by zero
+        eps = np.finfo(np.float32).eps
+        scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale)
+
+        if config.mode == CompressWeightsMode.E2M1:
+            max_val = opset.constant(6, ov.Type.f32)  # Maximal value of e2m1 type.
+            constant_2 = opset.constant(2, ov.Type.f32)
+            scale = divide_op(scale, max_val)
+            scale = opset.log(scale) / opset.log(constant_2)
+            scale = opset.ceil(scale)
+            scale = opset.clamp(scale, -127, 127)
+            scale = opset.power(constant_2, scale)
+
+    compressed_weight = divide_op(weight, scale)
+    compressed_weight = convert_op(compressed_weight, ov.Type.nf4)
+    compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype])
+
+    ov_results = [compressed_weight]
+    if len(ov_parameters) == 1:
+        ov_results.append(scale)
+
+    if return_nodes:
+        return ov_parameters, ov_results, ov_model_params
+
+    model = ov.Model(ov_results, ov_parameters)
+    compiled_model = _compile_ov_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32})
+
+    return partial(_infer_ov_model, ov_model_params, compiled_model)
+
+
 @cache_results(OV_MODEL_CACHE)
 def _build_integer_quantize_dequantize_weight_model(
     config: WeightCompressionConfig,
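
Note: for intuition, a NumPy mirror of the scale computation encoded by the OV graph above. This sketch is an illustrative assumption (the commit itself only builds OpenVINO ops): an absmax scale per reduction axis for NF4, plus the power-of-two adjustment applied for E2M1.

import numpy as np

def reference_float_scale(weight: np.ndarray, axis: int, e2m1: bool = False) -> np.ndarray:
    # Per-group absmax scale, guarded against division by zero with machine epsilon.
    scale = np.max(np.abs(weight), axis=axis, keepdims=True).astype(np.float32)
    eps = np.finfo(np.float32).eps
    scale = np.where(np.abs(scale) < eps, eps, scale)
    if e2m1:
        # Divide by 6 (the maximal e2m1 value) and round up to a power of two,
        # clamping the exponent to [-127, 127], as the E2M1 branch above does.
        scale = 2.0 ** np.clip(np.ceil(np.log2(scale / 6.0)), -127, 127)
    return scale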

nncf/quantization/algorithms/weight_compression/awq.py

+4-4
@@ -30,9 +30,8 @@
 from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats
 from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
-from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_quantized_weight
-from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_scale
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_dequantization
+from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight
 from nncf.quantization.passes import transform_to_inference_graph
 from nncf.tensor import TensorDataType
@@ -255,8 +254,9 @@ def apply(
                 cur_scale = gscale**alpha
 
                 weights_to_fake_quantize = gweight * cur_scale
                 if config.mode == CompressWeightsMode.NF4:
-                    g_c_scale = calculate_nf4_scale(weights_to_fake_quantize, reduction_axis)
-                    g_compressed_weighs = calculate_nf4_quantized_weight(weights_to_fake_quantize, g_c_scale)
+                    g_compressed_weighs, g_c_scale = do_float_quantization(
+                        weights_to_fake_quantize, config, reduction_axis
+                    )
                     g_decompressed_weighs = do_float_dequantization(g_compressed_weighs, g_c_scale)
                 else:
                     g_decompressed_weighs = integer_quantize_dequantize_weight(
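
Note: the AWQ change collapses the old calculate_nf4_scale + calculate_nf4_quantized_weight pair into a single quantize call followed by a dequantize. A hedged round-trip sketch of that pattern (the toy weight and axis are assumptions; the function names come from the imports in this diff):

import numpy as np

from nncf import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_dequantization
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization
from nncf.tensor import Tensor

config = WeightCompressionConfig(mode=CompressWeightsMode.NF4)
w = Tensor(np.random.rand(8, 64).astype(np.float32))

q, s = do_float_quantization(w, config, -1)  # quantize to NF4 with a per-row absmax scale
w_hat = do_float_dequantization(q, s)        # approximate reconstruction of w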

nncf/quantization/algorithms/weight_compression/gptq.py

+7-5
@@ -26,10 +26,10 @@
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
 from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation
+from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_float_quantization_params
 from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params
-from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_quantized_weight
-from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_scale
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_dequantization
+from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight
 from nncf.tensor import Tensor
 from nncf.tensor import functions as fns
@@ -262,7 +262,9 @@ def _quantize_weights(
 
                 if (i1 + i) % group_size == 0:
                     if block_compression_config.mode == CompressWeightsMode.NF4:
-                        scale = calculate_nf4_scale(weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes)
+                        scale = calculate_float_quantization_params(
+                            weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes, block_compression_config
+                        )
                         scales.append(scale)
                     else:
                         if self._scale_estimation and block_compression_config.num_bits == 4:
@@ -284,8 +286,8 @@ def _quantize_weights(
                        zero_points.append(zero_point)
 
                 if block_compression_config.mode == CompressWeightsMode.NF4:
-                    compressed_weights = calculate_nf4_quantized_weight(
-                        fns.unsqueeze(weight_col, 1), scales[-1], is_normalized_weight=False
+                    compressed_weights, _ = do_float_quantization(
+                        fns.unsqueeze(weight_col, 1), block_compression_config, precomputed_scale=scales[-1]
                     )
                     quantized_col = do_float_dequantization(compressed_weights, scales[-1], reduction_axis=-1)
                 else:
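
Note: the GPTQ path now computes the per-group scale once with calculate_float_quantization_params and reuses it through precomputed_scale. A hedged sketch of that two-step flow (the toy block and shapes are assumptions; the call order mirrors the diff):

import numpy as np

from nncf import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_float_quantization_params
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization
from nncf.tensor import Tensor

config = WeightCompressionConfig(mode=CompressWeightsMode.NF4)
block = Tensor(np.random.rand(16, 32).astype(np.float32))  # one block of weight columns

scale = calculate_float_quantization_params(block, 1, config)  # one scale per output row
compressed, _ = do_float_quantization(block, config, precomputed_scale=scale)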

nncf/quantization/algorithms/weight_compression/lora_correction.py

+1-5
@@ -25,7 +25,6 @@
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
 from nncf.quantization.algorithms.weight_compression.weight_lowering import CompressedWeight
-from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_quantized_weight
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_dequantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_dequantization
 from nncf.tensor import Tensor
@@ -177,10 +176,7 @@ def calculate_low_rank_matrices(
                 reduction_axis,
             )
         elif mode == CompressWeightsMode.NF4:
-            indexes = calculate_nf4_quantized_weight(
-                compressed_weight.tensor, compressed_weight.scale, is_normalized_weight=True
-            )
-            fq_weights = do_float_dequantization(indexes, compressed_weight.scale, reduction_axis)
+            fq_weights = do_float_dequantization(compressed_weight.tensor, compressed_weight.scale, reduction_axis)
         else:
             msg = (
                 f"{mode.value} mode is invalid for Lora Correction algorithm. Supported modes: INT4_SYM, INT4_ASYM, NF4"

nncf/quantization/algorithms/weight_compression/scale_estimation.py

+9-7
@@ -23,7 +23,6 @@
 from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
-from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_quantized_weight
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_dequantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
@@ -199,8 +198,7 @@ def calculate_quantization_params(
 
         original_weight = fns.zeros_like(weight) + weight
         if config.mode == CompressWeightsMode.NF4:
-            norm_weight, scale = do_float_quantization(original_weight, reduction_axis, cur_config.group_size)
-            compressed_weights = calculate_nf4_quantized_weight(norm_weight, scale, is_normalized_weight=True)
+            compressed_weights, scale = do_float_quantization(original_weight, cur_config, reduction_axis)
             q_weights = do_float_dequantization(compressed_weights, scale, reduction_axis)
             q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size)
             zp = None
@@ -249,7 +247,9 @@ def calculate_quantization_params(
             near_to_ideal_scale = near_to_ideal_scale * scale_sign
 
             if config.mode == CompressWeightsMode.NF4:
-                g_compressed_weighs = calculate_nf4_quantized_weight(original_weight, near_to_ideal_scale)
+                g_compressed_weighs, _ = do_float_quantization(
+                    original_weight, config, precomputed_scale=near_to_ideal_scale
+                )
                 out = do_float_dequantization(g_compressed_weighs, near_to_ideal_scale)
             else:
                 out = integer_quantize_dequantize_weight(
@@ -284,7 +284,7 @@ def calculate_quantization_params(
 
             if i < initial_steps - 1:
                 if config.mode == CompressWeightsMode.NF4:
-                    out = calculate_nf4_quantized_weight(original_weight, near_to_ideal_scale)
+                    out, _ = do_float_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale)
                 else:
                     out, _, _ = do_integer_quantization(
                         original_weight,
@@ -302,7 +302,7 @@ def calculate_quantization_params(
             scaled_scale = factor * scale
 
             if config.mode == CompressWeightsMode.NF4:
-                out = calculate_nf4_quantized_weight(original_weight, scaled_scale)
+                out, _ = do_float_quantization(original_weight, config, precomputed_scale=scaled_scale)
             else:
                 out, _, _ = do_integer_quantization(
                     original_weight,
@@ -318,7 +318,9 @@ def calculate_quantization_params(
             near_to_ideal_scale = near_to_ideal_scale * scale_sign
 
             if config.mode == CompressWeightsMode.NF4:
-                g_compressed_weighs = calculate_nf4_quantized_weight(original_weight, near_to_ideal_scale)
+                g_compressed_weighs, _ = do_float_quantization(
+                    original_weight, config, precomputed_scale=near_to_ideal_scale
+                )
                 out = do_float_dequantization(g_compressed_weighs, near_to_ideal_scale)
             else:
                 out = integer_quantize_dequantize_weight(
