Optimized weights compression for NF4 data type #3369

Draft · wants to merge 11 commits into develop
2 changes: 2 additions & 0 deletions .github/workflows/call_precommit.yml
@@ -91,6 +91,8 @@ jobs:
shell: bash
- name: Install NNCF and test requirements
run: pip install . -r tests/openvino/requirements.txt
- name: Install OV RC
run: pip install -U --pre openvino --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
- name: Print installed modules
run: pip list
- name: Run OV precommit test scope
3 changes: 3 additions & 0 deletions .github/workflows/conformance_weight_compression.yml
@@ -2,6 +2,7 @@ name: Weight compression
permissions: read-all

on:
pull_request:
workflow_call:
workflow_dispatch:
inputs:
@@ -41,6 +42,8 @@ jobs:
run: cat /proc/cpuinfo
- name: Install NNCF and test requirements
run: pip install -e . -r tests/post_training/requirements.txt
- name: Install OV RC
run: pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
- name: Print installed modules
run: pip list
- name: Run examples test scope
10 changes: 8 additions & 2 deletions nncf/openvino/optimized_functions/__init__.py
@@ -10,8 +10,14 @@
# limitations under the License.

from nncf.openvino.optimized_functions.functions import astype as astype
from nncf.openvino.optimized_functions.functions import do_int_quantization as do_int_quantization
from nncf.openvino.optimized_functions.functions import do_float_quantization as do_float_quantization
from nncf.openvino.optimized_functions.functions import do_integer_quantization as do_integer_quantization
from nncf.openvino.optimized_functions.functions import (
float_quantize_dequantize_weight as float_quantize_dequantize_weight,
)
from nncf.openvino.optimized_functions.functions import get_integer_quantization_error as get_integer_quantization_error
from nncf.openvino.optimized_functions.functions import quantize_dequantize_weight as quantize_dequantize_weight
from nncf.openvino.optimized_functions.functions import (
integer_quantize_dequantize_weight as integer_quantize_dequantize_weight,
)
from nncf.openvino.optimized_functions.models import OVModelParameters as OVModelParameters
from nncf.openvino.optimized_functions.models import clear_ov_model_cache as clear_ov_model_cache
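
The re-exports above rename the integer path and add a parallel float path. Below is a minimal sketch of how callers pick up the renamed API on this branch; the old-name mapping in the comments is inferred from the renames visible in this diff:

# Sketch: updated public names of the optimized functions (this branch only).
from nncf.openvino.optimized_functions import (
    do_float_quantization,  # new float (NF4/E2M1) counterpart of the integer path
    do_integer_quantization,  # formerly do_int_quantization
    float_quantize_dequantize_weight,
    integer_quantize_dequantize_weight,  # formerly quantize_dequantize_weight
    get_integer_quantization_error,
)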
103 changes: 95 additions & 8 deletions nncf/openvino/optimized_functions/functions.py
@@ -11,13 +11,16 @@

from typing import Optional, Tuple, Union

from nncf import CompressWeightsMode
from nncf.common.utils.caching import disable_results_caching
from nncf.openvino.optimized_functions.models import OV_MODEL_CACHE
from nncf.openvino.optimized_functions.models import OVModelParameters
from nncf.openvino.optimized_functions.models import get_astype_model
from nncf.openvino.optimized_functions.models import get_compress_decompress_weight_model
from nncf.openvino.optimized_functions.models import get_compress_weight_model
from nncf.openvino.optimized_functions.models import get_quantization_error_model
from nncf.openvino.optimized_functions.models import get_float_quantization_model
from nncf.openvino.optimized_functions.models import get_float_quantize_dequantize_weight_model
from nncf.openvino.optimized_functions.models import get_integer_quantization_error_model
from nncf.openvino.optimized_functions.models import get_integer_quantization_model
from nncf.openvino.optimized_functions.models import get_integer_quantize_dequantize_weight_model
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization
from nncf.tensor import Tensor
@@ -27,7 +30,7 @@
ReductionAxes = Union[int, Tuple[int, ...]]


def do_int_quantization(
def do_integer_quantization(
weight: Tensor,
config: WeightCompressionConfig,
reduction_axes: Optional[ReductionAxes] = None,
@@ -63,7 +66,7 @@ def do_int_quantization(
{"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype}
)

model = get_compress_weight_model(
model = get_integer_quantization_model(
ov_model_params,
config,
weight_shape,
@@ -97,7 +100,49 @@ def do_int_quantization(
return compressed_weight, scale, zero_point


def quantize_dequantize_weight(
def do_float_quantization(
    weight: Tensor,
    config: WeightCompressionConfig,
    reduction_axes: Optional[ReductionAxes] = None,
    precomputed_scale: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor]:
    """
    Quantizes the given weight into a float data type (NF4 or E2M1) using an optimized OpenVINO model.

    :param weight: The weight to quantize.
    :param config: Weight compression configuration.
    :param reduction_axes: Axes along which the quantization scale is computed.
    :param precomputed_scale: Optional precomputed scale to reuse instead of computing one.
    :return: A tuple of the compressed weight and the scale.
    """
    weight_shape = weight.shape
    scale_shape = None if precomputed_scale is None else precomputed_scale.shape

    ov_model_params = OVModelParameters()
    ov_model_params.input_dtypes["weight"] = weight.dtype
    if precomputed_scale is not None:
        ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype
    if config.num_bits == 4 and weight.backend == TensorBackend.ov:
        # Return OV tensors in the target precision so that they can later be inserted
        # into the OpenVINO model seamlessly
        ov_model_params.return_ov_tensors = True
        dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1
        ov_model_params.output_dtypes.update({"compressed_weight": dtype})

model = get_float_quantization_model(
ov_model_params,
config,
weight_shape,
scale_shape,
reduction_axes,
)

if precomputed_scale is None:
# weight -> compressed_weight, scale
compressed_weight, scale = model([weight])

# The scale is always fp32, so there is no need to keep it as an ov.Tensor
if scale.backend == TensorBackend.ov:
scale = scale.as_numpy_tensor()
else:
# weight, scale -> compressed_weight
compressed_weight = model([weight, precomputed_scale])[0]
scale = precomputed_scale

return compressed_weight, scale
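
For context, a minimal usage sketch of the new float path; the WeightCompressionConfig arguments and the weight shape are illustrative assumptions, not taken from this diff:

# Hypothetical example: per-channel NF4 compression of a 2-D weight.
import numpy as np

from nncf import CompressWeightsMode
from nncf.openvino.optimized_functions import do_float_quantization
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import Tensor

weight = Tensor(np.random.rand(64, 128).astype(np.float32))
config = WeightCompressionConfig(mode=CompressWeightsMode.NF4)  # assumed default group_size=-1, i.e. per-channel
compressed_weight, scale = do_float_quantization(weight, config, reduction_axes=1)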


def integer_quantize_dequantize_weight(
weight: Tensor,
config: WeightCompressionConfig,
reduction_axes: Optional[ReductionAxes] = None,
@@ -135,7 +180,7 @@ def quantize_dequantize_weight(
if precomputed_zero_point is not None:
ov_model_params.input_dtypes["zero_point"] = precomputed_zero_point.dtype

model = get_compress_decompress_weight_model(
model = get_integer_quantize_dequantize_weight_model(
ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_compressed_weight
)

@@ -161,6 +206,48 @@
return decompressed_weight


def float_quantize_dequantize_weight(
    weight: Tensor,
    config: WeightCompressionConfig,
    reduction_axes: Optional[ReductionAxes] = None,
    precomputed_scale: Optional[Tensor] = None,
    return_compressed_weight: Optional[bool] = False,
) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor]]:
    """
    Quantizes the given weight into a float data type (NF4 or E2M1) and dequantizes it back, using an
    optimized OpenVINO model. Optionally also returns the intermediate compressed weight and scale.
    """
    # When reduction axes are not provided, it is assumed that the weights are already reshaped
    if config.group_size != -1 and reduction_axes is not None:
        # Weights are reshaped from [a1, r, a2] to [a1, r // gs, gs, a2], where gs is the group size
        weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)

weight_shape = weight.shape
scale_shape = precomputed_scale.shape if precomputed_scale is not None else None

ov_model_params = OVModelParameters()
ov_model_params.input_dtypes["weight"] = weight.dtype
if precomputed_scale is not None:
ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype

model = get_float_quantize_dequantize_weight_model(
ov_model_params, config, weight_shape, scale_shape, reduction_axes, return_compressed_weight
)

inputs = [weight]
if precomputed_scale is not None:
inputs.append(precomputed_scale)

    compressed_weight, scale = None, precomputed_scale
    results = model(inputs)
    # The model returns one to three outputs, depending on whether the compressed weight and
    # a freshly computed scale are requested in addition to the decompressed weight
    if len(results) == 1:
        decompressed_weight = results[0]
    elif len(results) == 2:
        decompressed_weight, compressed_weight = results
    else:
        decompressed_weight, compressed_weight, scale = results
if return_compressed_weight:
return decompressed_weight, compressed_weight, scale
else:
return decompressed_weight
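
To make the grouped path concrete, here is a hypothetical round-trip sketch with explicit shapes; the shapes and config arguments are assumptions for illustration:

# Hypothetical example: group-wise NF4 quantize-dequantize round trip.
# A [128, 256] weight with reduction_axes=1 and group_size=64 is reshaped
# to [128, 4, 64] internally, so one scale is computed per group of 64 values.
import numpy as np

from nncf import CompressWeightsMode
from nncf.openvino.optimized_functions import float_quantize_dequantize_weight
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import Tensor

weight = Tensor(np.random.rand(128, 256).astype(np.float32))
config = WeightCompressionConfig(mode=CompressWeightsMode.NF4, group_size=64)
decompressed_weight, compressed_weight, scale = float_quantize_dequantize_weight(
    weight, config, reduction_axes=1, return_compressed_weight=True
)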


def get_integer_quantization_error(
weight: Tensor,
reduction_axes: ReductionAxes,
@@ -188,7 +275,7 @@ def get_integer_quantization_error(

ov_model_params = OVModelParameters()
ov_model_params.input_dtypes["weight"] = weight.dtype
model = get_quantization_error_model(
model = get_integer_quantization_error_model(
ov_model_params, config, original_weight_shape, weight.shape, original_reduction_axes, reduction_axes
)
