Commit 7135bbb

OV data-free mixed precision assignment running time improvement (#3292)
### Changes

After the implementation of openvino optimized functions in #2727, the int4 compression part of the compression pipeline is now faster than the mixed precision search part. In this PR, mixed precision assignment runtime is improved using the same approach: the function `weight_lowering.get_integer_quantization_error()` is optimized. Below are before/after results for several BF16 models:

| Model | Compression | Compression Time Develop (sec.) | Compression Time Branch (sec.) | Peak Memory Develop (MiB) | Peak Memory Branch (MiB) |
|--------------|----------------------|-------|--------------|------|-------------|
| tiny-llama   | int4_asym, ratio=0.8 | 22.94 | 17.03 (-26%) | 1406 | 1085 (-23%) |
| phi3-mini    | int4_asym, ratio=0.8 | 56.40 | 32.06 (-43%) | 3669 | 2754 (-25%) |
| llama-3.1-8b | int4_asym, ratio=0.8 | 97.27 | 56.80 (-42%) | 6660 | 5505 (-17%) |

### Related tickets

161921

### Tests

Extended:
- tests/openvino/optimized_functions/test_compression_functions.py
- tests/openvino/optimized_functions/test_ov_model_parameters.py

https://github.com/openvinotoolkit/nncf/actions/runs/13412231315
1 parent bf9fe48 commit 7135bbb

File tree

13 files changed: +282 -63 lines


nncf/openvino/graph/node_utils.py

+16 -7

@@ -110,21 +110,30 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int:
     return cnt_if_op(model, 0)
 
 
-def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: bool = True) -> np.ndarray:
+def get_const_value_as_numpy_tensor(const_node: ov.Node) -> np.ndarray:
     """
-    Returns the constant tensor for the node.
+    Returns the constant tensor for the node as an instance of np.ndarray. BF16 constants will be converted to FP32.
     This method is applicable only for the floating-point constant data.
 
     :param const_node: OpenVINO node.
-    :param cast_bf16_to_fp32: Whether to cast bf16 node data to fp32 or not. If False and the node contains bf16 data,
-        the resulting bf16 value will be returned encoded inside a numpy.float16 array.
     :return: The constant value.
     """
-    if const_node.get_element_type() == ov.Type.bf16 and cast_bf16_to_fp32:
+    if const_node.get_element_type() == ov.Type.bf16:
         return const_node.get_data(dtype=np.float32)
     return const_node.data
 
 
+def get_const_value_as_ov_tensor(const_node: ov.Node) -> ov.Tensor:
+    """
+    Returns the constant tensor for the node as an instance of openvino.Tensor which is useful when BF16 constant
+    needs to be retrieved as is.
+
+    :param const_node: OpenVINO node.
+    :return: The constant value as openvino.Tensor.
+    """
+    return ov.Tensor(const_node.data, const_node.get_output_shape(0), const_node.get_element_type())
+
+
 def get_bias_value(
     node_with_bias: NNCFNode, nncf_graph: NNCFGraph, model: ov.Model, node_mapping: Dict[str, ov.Node] = None
 ) -> np.ndarray:

@@ -141,7 +150,7 @@ def get_bias_value(
     node_mapping = {op.get_friendly_name(): op for op in model.get_ops()}
     bias_constant = get_node_with_bias_value(get_add_bias_node(node_with_bias, nncf_graph), nncf_graph)
     ov_bias_constant = node_mapping[bias_constant.node_name]
-    return get_const_value(ov_bias_constant)
+    return get_const_value_as_numpy_tensor(ov_bias_constant)
 
 
 def get_weight_value(node_with_weight: NNCFNode, model: ov.Model, port_id: int) -> np.ndarray:

@@ -157,7 +166,7 @@ def get_weight_value(node_with_weight: NNCFNode, model: ov.Model, port_id: int)
     const_op_friendly_name = node_with_weight.layer_attributes.constant_attributes[port_id]["name"]
     friendly_name_to_op_map = {op.get_friendly_name(): op for op in model.get_ops()}
     const_op = friendly_name_to_op_map[const_op_friendly_name]
-    weight_tensor = get_const_value(const_op)
+    weight_tensor = get_const_value_as_numpy_tensor(const_op)
     return weight_tensor
 
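A minimal usage sketch of the two helpers introduced above. The constant construction is illustrative only (NNCF looks these nodes up in a loaded model), and the opset import style is an assumption; the point is that one helper returns a NumPy view (BF16 upcast to FP32) while the other keeps the original element type inside an `openvino.Tensor`.

```python
import numpy as np
import openvino as ov
from openvino.runtime import opset13 as opset

from nncf.openvino.graph.node_utils import get_const_value_as_numpy_tensor
from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor

# Illustrative constant node; in NNCF these nodes are looked up from a real model.
const_node = opset.constant(np.ones((4, 4), dtype=np.float32))

# np.ndarray view of the constant; BF16 constants would be upcast to float32 here.
np_weight = get_const_value_as_numpy_tensor(const_node)

# openvino.Tensor view that keeps the constant's original element type (useful for BF16).
ov_weight = get_const_value_as_ov_tensor(const_node)

print(np_weight.dtype, ov_weight.element_type)
```
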
nncf/openvino/optimized_functions/__init__.py

+1

@@ -11,6 +11,7 @@
 
 from nncf.openvino.optimized_functions.functions import astype as astype
 from nncf.openvino.optimized_functions.functions import do_int_quantization as do_int_quantization
+from nncf.openvino.optimized_functions.functions import get_integer_quantization_error as get_integer_quantization_error
 from nncf.openvino.optimized_functions.functions import quantize_dequantize_weight as quantize_dequantize_weight
 from nncf.openvino.optimized_functions.models import OVModelParameters as OVModelParameters
 from nncf.openvino.optimized_functions.models import clear_ov_model_cache as clear_ov_model_cache

nncf/openvino/optimized_functions/functions.py

+37

@@ -17,6 +17,7 @@
 from nncf.openvino.optimized_functions.models import get_astype_model
 from nncf.openvino.optimized_functions.models import get_compress_decompress_weight_model
 from nncf.openvino.optimized_functions.models import get_compress_weight_model
+from nncf.openvino.optimized_functions.models import get_quantization_error_model
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization
 from nncf.tensor import Tensor

@@ -168,6 +169,42 @@ def quantize_dequantize_weight(
     return decompressed_weight
 
 
+def get_integer_quantization_error(
+    weight: Tensor,
+    reduction_axes: ReductionAxes,
+    config: WeightCompressionConfig,
+) -> float:
+    """
+    Calculates a quantity characterizing the difference between floating point weights and fake quantized
+    (compressed and decompressed) to integer ones.
+
+    The error is computed as follows:
+    error = max(mean((decompressed_weight - weight)^2, axis=reduction_axes))
+
+    :param weight: Weight array to compress.
+    :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max).
+    :param config: Information on how to compress (quantize) a specific weight.
+    :return: The quantity characterizing the error of integer quantization.
+    """
+    original_weight_shape = weight.shape
+    original_reduction_axes = reduction_axes
+
+    # When reduction axes are not provided, assuming that the weights are already reshaped
+    if config.group_size != -1 and reduction_axes is not None:
+        # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2]
+        weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)
+
+    ov_model_params = OVModelParameters()
+    ov_model_params.input_dtypes["weight"] = weight.dtype
+    model = get_quantization_error_model(
+        ov_model_params, config, original_weight_shape, weight.shape, original_reduction_axes, reduction_axes
+    )
+
+    quantization_error = model([weight])[0].item()
+
+    return quantization_error
+
+
 def astype(a: Tensor, dtype: TensorDataType) -> Tensor:
     """
     Converts the given tensor to the specified data type. Allows to convert between u4, i4, bf16 data types which are
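A sketch of how the new optimized entry point could be called directly. The weight shape, mode, and group size below are made up for illustration; in the actual pipeline the call is routed through `weight_lowering.get_integer_quantization_error()` and the backend dispatch shown in the mixed precision changes further down.

```python
import numpy as np

from nncf.openvino.optimized_functions import get_integer_quantization_error
from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import Tensor

# Illustrative FP32 weight; in the mixed precision search this comes from the model.
weight = Tensor(np.random.rand(128, 256).astype(np.float32))
config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_ASYM, group_size=64)

# Max over output channels of the mean squared error between the original weight
# and its fake-quantized (compressed + decompressed) counterpart.
error = get_integer_quantization_error(weight, reduction_axes=(1,), config=config)
print(error)
```
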

nncf/openvino/optimized_functions/models.py

+72 -7

@@ -203,7 +203,6 @@ def get_compress_weight_model(
     scale_shape: Optional[Tuple] = None,
     zero_point_shape: Optional[Tuple] = None,
     reduction_axes: Optional[ReductionAxes] = None,
-    return_nodes: Optional[bool] = False,
 ) -> Union[ModelCallable, ModelAsNodes]:
     """
     Get a model that compresses weights using the given configuration.

@@ -217,8 +216,6 @@
         as an input.
     :param reduction_axes: Optional axes to reduce the weight tensor. Not needed if scale (and z.p.) are provided as
         inputs.
-    :param return_nodes: Whether to return the OV model inputs parameters and results nodes instead of the model
-        callable.
     :return: A model callable that compresses weights using the given configuration. Or a model as nodes, if
         `return_nodes` is True.
     """

@@ -233,7 +230,6 @@
         scale_shape,
         zero_point_shape,
         reduction_axes,
-        return_nodes=return_nodes,
     )
 
 

@@ -278,6 +274,35 @@ def get_compress_decompress_weight_model(
     )
 
 
+def get_quantization_error_model(
+    ov_model_params: OVModelParameters,
+    config: WeightCompressionConfig,
+    original_weight_shape: Tuple,
+    weight_shape: Tuple,
+    original_reduction_axes: ReductionAxes,
+    reduction_axes: ReductionAxes,
+) -> ModelCallable:
+    """
+    Get a model that calculates the quantization error for a given weight.
+
+    This function builds a model that compresses and then decompresses the given weight, and calculates the
+    quantization error by comparing the original weight with the decompressed weight.
+
+    :param ov_model_params: OV model parameters.
+    :param config: Compression configuration.
+    :param original_weight_shape: Shape of the original weight tensor.
+    :param weight_shape: Shape of the weight tensor to be compressed.
+    :param original_reduction_axes: Reduction axes of the original weight tensor before reshaping.
+    :param reduction_axes: Axes to reduce the weight tensor.
+    :return: A model callable that returns the quantization error.
+    """
+    weight_shape, _, _ = _prepare_compression_model_inputs(ov_model_params, weight_shape, None, None, reduction_axes)
+
+    return _build_quantization_error_model(
+        config, ov_model_params, original_weight_shape, weight_shape, original_reduction_axes, reduction_axes
+    )
+
+
 @cache_results(OV_MODEL_CACHE)
 def _build_compress_model(
     config: WeightCompressionConfig,

@@ -437,7 +462,8 @@ def _build_compress_decompress_model(
     zero_point_shape: Optional[Tuple] = None,
     reduction_axes: Optional[ReductionAxes] = None,
     return_compressed_weight: Optional[bool] = False,
-) -> ModelCallable:
+    return_nodes: Optional[bool] = False,
+) -> Union[ModelCallable, ModelAsNodes]:
     default_output_dtypes = {"decompressed_weight": TensorDataType.float32}
     if not return_compressed_weight:
         # If compressed weight is not returned to a user, we can keep it in float32 to avoid additional conversion

@@ -451,8 +477,8 @@
         raise ValueError(msg)
 
     # Get compression model as input/result nodes and potentially modified ov model parameters
-    ov_parameters, ov_results, ov_model_params = get_compress_weight_model(
-        ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True
+    ov_parameters, ov_results, ov_model_params = _build_compress_model(
+        config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True
     )
 
     if config.is_asym_mode:

@@ -477,12 +503,51 @@
         decompressed_weight = opset.multiply(scale, convert_op(compressed_weight, ov.Type.f32))
 
     ov_results = [decompressed_weight] + ov_results if return_compressed_weight else [decompressed_weight]
+
+    if return_nodes:
+        return ov_parameters, ov_results, ov_model_params
+
     model = ov.Model(ov_results, ov_parameters)
     compiled_model = _compile_ov_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32})
 
     return partial(_infer_ov_model, ov_model_params, compiled_model)
 
 
+@cache_results(OV_MODEL_CACHE)
+def _build_quantization_error_model(
+    config: WeightCompressionConfig,
+    ov_model_params: OVModelParameters,
+    original_weight_shape: Tuple,
+    weight_shape: Tuple,
+    original_reduction_axes: ReductionAxes,
+    reduction_axes: ReductionAxes,
+) -> ModelCallable:
+    ov_parameters, ov_results, ov_model_params = _build_compress_decompress_model(
+        config,
+        ov_model_params,
+        weight_shape,
+        reduction_axes=reduction_axes,
+        return_compressed_weight=False,
+        return_nodes=True,
+    )
+
+    weight = ov_parameters[0]
+    decompressed_weight = ov_results[0]
+
+    weight = convert_op(opset.reshape(weight, original_weight_shape, special_zero=False), ov.Type.f32)
+    decompressed_weight = convert_op(
+        opset.reshape(decompressed_weight, original_weight_shape, special_zero=False), ov.Type.f32
+    )
+    diff = opset.squared_difference(decompressed_weight, weight)
+    layer_err = opset.reduce_mean(diff, reduction_axes=original_reduction_axes)
+    quantization_error = opset.reduce_max(layer_err, reduction_axes=tuple(range(len(layer_err.shape))))
+
+    model = ov.Model([quantization_error], ov_parameters)
+    compiled_model = _compile_ov_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32})
+
+    return partial(_infer_ov_model, ov_model_params, compiled_model)
+
+
 def get_astype_model(ov_model_params: OVModelParameters, input_shape: Tuple) -> ModelCallable:
     """
     Return a model that cast the input of the given shape to the given data type. Especially useful for
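For reference, the metric the OV graph above implements (squared difference, mean over the original reduction axes, then max) can be mirrored in plain NumPy. This is a hypothetical sanity-check helper, not part of the changeset.

```python
import numpy as np

def reference_quantization_error(weight, decompressed_weight, reduction_axes):
    # Mirrors the OV graph: squared_difference -> reduce_mean over the original
    # reduction axes -> reduce_max over the remaining axes.
    diff = (np.asarray(decompressed_weight, dtype=np.float32) - np.asarray(weight, dtype=np.float32)) ** 2
    layer_err = np.mean(diff, axis=reduction_axes)
    return float(np.max(layer_err))
```
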

nncf/quantization/algorithms/weight_compression/mixed_precision.py

+8 -3

@@ -139,9 +139,11 @@ def available_backends(self) -> List[BackendType]:
     def _set_backend_entity(self, model: TModel) -> None:
         model_backend = get_backend(model)
         if model_backend == BackendType.OPENVINO:
-            from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend
+            from nncf.quantization.algorithms.weight_compression.openvino_backend import (
+                OVTensorWeightCompressionAlgoBackend,
+            )
 
-            self._backend_entity = OVWeightCompressionAlgoBackend(model)
+            self._backend_entity = OVTensorWeightCompressionAlgoBackend(model)
         elif model_backend == BackendType.TORCH:
             from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend
 

@@ -161,7 +163,10 @@ def _calc_weight_sensitivity(
         graph: NNCFGraph,
     ) -> float:
         weight = self._backend_entity.get_weight(
-            weight_param.node_with_weight, weight_param.weight_port_id, model, graph
+            weight_param.node_with_weight,
+            weight_param.weight_port_id,
+            model,
+            graph,
         )
         backup_config = WeightCompressionConfig()
         reduction_axes = weight_param.reduction_axes

nncf/quantization/algorithms/weight_compression/openvino_backend.py

+17 -5

@@ -33,7 +33,8 @@
 from nncf.openvino.graph.model_transformer import OVModelTransformer
 from nncf.openvino.graph.node_utils import convert_op
 from nncf.openvino.graph.node_utils import create_ov_const_from_tensor
-from nncf.openvino.graph.node_utils import get_const_value
+from nncf.openvino.graph.node_utils import get_const_value_as_numpy_tensor
+from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor
 from nncf.openvino.graph.node_utils import get_weight_channel_axes
 from nncf.openvino.graph.transformations.command_creation import OVCommandCreator
 from nncf.openvino.graph.transformations.commands import OVTargetPoint

@@ -131,7 +132,7 @@ def get_weight_names_and_port_ids(node: NNCFNode, graph: NNCFGraph) -> List[Tupl
     def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph) -> Tensor:
         weight_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["name"]
         weight_node = self.name_to_node_mapping[weight_name]
-        weight_tensor = get_const_value(weight_node)
+        weight_tensor = get_const_value_as_numpy_tensor(weight_node)
         return Tensor(weight_tensor)
 
     def get_weight_dtype(

@@ -298,12 +299,10 @@ def transform_model(
             const_node = self.name_to_node_mapping[const_node_name]
             const_node_output = const_node.output(0)
             const_dtype = const_node_output.get_element_type()
-            weight = get_const_value(const_node, cast_bf16_to_fp32=False)
             # Creation of ov.Tensor is required for two reasons:
             # 1. To be able to process BF16 weight properly
             # 2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed
-            weight = ov.Tensor(weight, weight.shape, const_dtype)
-            weight = Tensor(weight)
+            weight = Tensor(get_const_value_as_ov_tensor(const_node))
 
             should_add_convert_node = False
             if const_dtype != ov.Type.f16:

@@ -365,6 +364,19 @@ def filter_func(point: StatisticPoint) -> bool:
         return filter_func
 
 
+class OVTensorWeightCompressionAlgoBackend(OVWeightCompressionAlgoBackend):
+    """
+    OpenVINO backend for weight compression algorithms that fetches model weights as openvino.Tensor instances.
+    This allows to natively process BF16/FP16 weights.
+    """
+
+    def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph) -> Tensor:
+        weight_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["name"]
+        weight_node = self.name_to_node_mapping[weight_name]
+        weight_tensor = get_const_value_as_ov_tensor(weight_node)
+        return Tensor(weight_tensor)
+
+
 class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend):
     @staticmethod
     def get_awq_patterns():

0 commit comments
