
Commit 47dbe38: strip in one commit
1 parent: 5f4378e

File tree (5 files changed: +285 -3 lines changed)

  nncf/torch/quantization/strip.py         +170 -2
  tests/torch/helpers.py                   +11
  tests/torch/ptq/test_fq_lora.py          +61 -1
  tests/torch/quantization/test_strip.py   +39
  tests/torch/requirements.txt             +4

nncf/torch/quantization/strip.py (+170 -2)
@@ -10,16 +10,37 @@
 # limitations under the License.


+from typing import List
+
 import numpy as np
 import torch
 from torch.quantization.fake_quantize import FakeQuantize

 import nncf
+from nncf.common.graph.transformations.commands import Command
+from nncf.common.graph.transformations.commands import TargetType
+from nncf.common.graph.transformations.layout import TransformationLayout
+from nncf.experimental.common.check_feature import is_experimental_torch_tracing_enabled
+from nncf.experimental.torch2.commands import PT2InsertionCommand
+from nncf.torch.dynamic_graph.scope import Scope
 from nncf.torch.graph.transformations.commands import ExtraCompressionModuleType
+from nncf.torch.graph.transformations.commands import PTSharedFnInsertionCommand
+from nncf.torch.graph.transformations.commands import PTTargetPoint
+from nncf.torch.model_graph_manager import get_const_node
+from nncf.torch.model_graph_manager import get_module_by_name
+from nncf.torch.model_graph_manager import split_const_name
+from nncf.torch.model_transformer import PTModelTransformer
 from nncf.torch.nncf_network import NNCFNetwork
+from nncf.torch.quantization.layers import AsymmetricLoraQuantizer
 from nncf.torch.quantization.layers import AsymmetricQuantizer
 from nncf.torch.quantization.layers import BaseQuantizer
+from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor
+from nncf.torch.quantization.layers import INT4SymmetricWeightsDecompressor
+from nncf.torch.quantization.layers import INT8AsymmetricWeightsDecompressor
+from nncf.torch.quantization.layers import INT8SymmetricWeightsDecompressor
+from nncf.torch.quantization.layers import SymmetricLoraQuantizer
 from nncf.torch.quantization.layers import SymmetricQuantizer
+from nncf.torch.quantization.quantize_functions import TuneRange

 SUPPORTED_NUM_BITS_FOR_STRIP_MODEL = [8]

@@ -171,6 +192,153 @@ def strip_quantized_model(model: NNCFNetwork):
     :param model: Compressed model.
     :return: The modified NNCF network.
     """
-    model = replace_quantizer_to_torch_native_module(model)
-    model = remove_disabled_quantizers(model)
+    model_layout = model.nncf.transformation_layout()
+    transformations = model_layout.transformations
+    if any(type(q.fn) in [AsymmetricLoraQuantizer, SymmetricLoraQuantizer] for q in transformations):
+        model = replace_with_decompressors(model, transformations)
+    else:
+        model = replace_quantizer_to_torch_native_module(model)
+        model = remove_disabled_quantizers(model)
     return model
+
+
+def replace_with_decompressors(model: NNCFNetwork, transformations: List[Command]) -> NNCFNetwork:
+    """
+    Performs the transformation from the fake-quantize (FQ) format to the dequantization (DQ) one.
+    The former takes a floating-point input, quantizes and dequantizes it, and returns a floating-point value,
+    while the latter takes a quantized integer representation, dequantizes it, and outputs a floating-point result.
+
+    Mathematically, both methods lead to the same outcome, but due to differences in the order of operations and
+    rounding errors the actual results may differ. In particular, this error can occur for values that lie exactly
+    at the midpoint between two quantized values ("quants").
+
+    The FQ format may round such a value to one quant, while the DQ format rounds it to another.
+    To avoid this, the compressed representation is obtained not by quantizing the original input directly,
+    but by quantizing the pre-processed, fake-quantized, floating-point representation.
+
+    :param model: Compressed model.
+    :param transformations: Commands holding the quantizers to replace with decompressors.
+    :return: The modified NNCF network.
+    """
+    transformation_layout = TransformationLayout()
+    model = model.nncf.get_clean_shallow_copy()
+    graph = model.nncf.get_graph()
+
+    for command in transformations:
+        quantizer = command.fn
+
+        if len(command.target_points) > 1:
+            msg = "Command contains more than one target point!"
+            raise nncf.ValidationError(msg)
+
+        tp = command.target_points[0]
+        node_with_weight = graph.get_node_by_name(tp.target_node_name)
+        weight_node = get_const_node(node_with_weight, tp.input_port_id, graph)
+
+        module_name, weight_attr_name = split_const_name(weight_node.layer_attributes.name)
+        module = get_module_by_name(module_name, model)
+        original_weight = getattr(module, weight_attr_name)
+
+        original_dtype = original_weight.dtype
+        original_shape = original_weight.shape
+        original_eps = torch.finfo(original_dtype).eps
+
+        qdq_weight = quantizer.quantize(original_weight)
+        if hasattr(quantizer, "_lspec"):
+            # Special reshape for the LoRA-grouped output
+            qdq_weight = qdq_weight.reshape(quantizer._lspec.weight_shape)
+        qdq_weight = qdq_weight.to(original_dtype)
+
+        if isinstance(quantizer, AsymmetricQuantizer):
+            input_range_safe = abs(quantizer.input_range) + quantizer.eps
+            input_low, input_range = TuneRange.apply(quantizer.input_low, input_range_safe, quantizer.levels)
+
+            integer_dtype = torch.uint8
+
+            input_low = input_low.to(original_dtype)
+            input_range = input_range.to(original_dtype)
+
+            scale = input_range / quantizer.level_high
+            scale = torch.where(torch.abs(scale) < original_eps, original_eps, scale)
+            scale = scale.to(original_dtype)
+
+            zero_point = quantizer.level_low - torch.round(input_low / scale)
+            zero_point = torch.clip(zero_point, quantizer.level_low, quantizer.level_high)
+            zero_point = zero_point.to(integer_dtype)
+
+            q_weight = qdq_weight / scale
+            q_weight = q_weight + zero_point
+            q_weight = torch.round(q_weight)
+            q_weight = torch.clip(q_weight, quantizer.level_low, quantizer.level_high)
+            q_weight = q_weight.to(integer_dtype)
+
+            if quantizer.num_bits == 8:
+                decompressor = INT8AsymmetricWeightsDecompressor(
+                    scale=scale, zero_point=zero_point, result_dtype=original_dtype
+                )
+            else:
+                decompressor = INT4AsymmetricWeightsDecompressor(
+                    scale=scale,
+                    zero_point=zero_point,
+                    compressed_weight_shape=q_weight.shape,
+                    result_shape=original_shape,
+                    result_dtype=original_dtype,
+                )
+        elif isinstance(quantizer, SymmetricQuantizer):
+            integer_dtype = torch.int8
+
+            scale = quantizer.scale / abs(quantizer.level_low)
+            scale = torch.where(torch.abs(scale) < original_eps, original_eps, scale)
+            scale = scale.to(original_dtype)
+
+            q_weight = qdq_weight / scale
+            q_weight = torch.round(q_weight)
+            q_weight = torch.clip(q_weight, quantizer.level_low, quantizer.level_high)
+            q_weight = q_weight.to(integer_dtype)
+
+            if quantizer.num_bits == 8:
+                decompressor = INT8SymmetricWeightsDecompressor(scale=scale, result_dtype=original_dtype)
+            else:
+                decompressor = INT4SymmetricWeightsDecompressor(
+                    scale=scale,
+                    compressed_weight_shape=q_weight.shape,
+                    result_shape=original_shape,
+                    result_dtype=original_dtype,
+                )
+
+        packed_tensor = decompressor.pack_weight(q_weight)
+
+        # Set the compressed tensor in place of the original weight
+        compressed_parameter = torch.nn.Parameter(packed_tensor, requires_grad=False)
+        setattr(module, weight_attr_name, compressed_parameter)
+
+        # Rebind any other modules that share the same weight parameter
+        consumer_nodes = graph.get_next_nodes(weight_node)
+        if len(consumer_nodes) > 1:
+            for consumer_node in consumer_nodes:
+                consumer_module = model.nncf.get_module_by_scope(Scope.from_str(consumer_node.layer_name))
+                for name, param in consumer_module.named_parameters(recurse=False, remove_duplicate=False):
+                    if id(param) == id(original_weight):
+                        setattr(consumer_module, name, compressed_parameter)
+
+        if is_experimental_torch_tracing_enabled():
+            transformation_layout.register(
+                PT2InsertionCommand(
+                    [
+                        PTTargetPoint(
+                            TargetType.OPERATOR_POST_HOOK, target_node_name=weight_node.node_name.replace(".", ":")
+                        )
+                    ],
+                    decompressor,
+                )
+            )
+        else:
+            decompressor_name = f"weights_decompressor_{weight_node.node_name.replace('.', '_')}"
+            transformation_layout.register(
+                PTSharedFnInsertionCommand(
+                    [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=weight_node.node_name)],
+                    decompressor,
+                    decompressor_name,
+                )
+            )
+
+    return PTModelTransformer(model).transform(transformation_layout)
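Why re-quantizing the fake-quantized weight makes the two formats agree, as the docstring above argues: once a tensor has passed through FQ it already sits on the integer grid, so re-quantizing it cannot land on a rounding midpoint, and the DQ path reproduces the FQ output exactly. A minimal sketch of the symmetric 8-bit case (illustrative code, not part of the commit; names are ours):

import torch

def fake_quantize_sym(w: torch.Tensor, scale: torch.Tensor, level_low: int = -128, level_high: int = 127) -> torch.Tensor:
    # FQ: float in -> snap to the integer grid -> float out
    q = torch.clamp(torch.round(w / scale), level_low, level_high)
    return q * scale

w = torch.randn(4, 8)
scale = torch.tensor(0.05)
fq_out = fake_quantize_sym(w, scale)

# DQ path, derived from the fake-quantized weight the way replace_with_decompressors does:
q_weight = torch.clamp(torch.round(fq_out / scale), -128, 127).to(torch.int8)
dq_out = q_weight.to(torch.float32) * scale

# fq_out is already on the grid, so the round trip is lossless
assert torch.equal(fq_out, dq_out)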

tests/torch/helpers.py (+11)

@@ -773,3 +773,14 @@ def _check_pre_post_hooks(
     assert len(actual_hooks) == len(ref_hooks)
     for actual_hook, ref_hook in zip(actual_hooks, ref_hooks):
         assert actual_hook is ref_hook
+
+
+class LinearModel(nn.Module):
+    def __init__(self, input_shape: List[int]):
+        super().__init__()
+        with set_torch_seed():
+            self.linear = nn.Linear(input_shape[1], input_shape[0], bias=False)
+            self.linear.weight.data = torch.randn(input_shape) - 0.5
+
+    def forward(self, x):
+        return self.linear(x)
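For context, the new helper builds a seeded, bias-free nn.Linear whose weight matrix has shape input_shape, so a hypothetical usage looks like:

import torch
from tests.torch.helpers import LinearModel

model = LinearModel(input_shape=[1, 16])  # nn.Linear(in_features=16, out_features=1), weight shape [1, 16]
out = model(torch.ones([1, 16]))          # output shape [1, 1]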

tests/torch/ptq/test_fq_lora.py (+61 -1)

@@ -11,6 +11,10 @@

 import pytest
 import torch
+from optimum.exporters.openvino.convert import export_from_model
+from optimum.intel.openvino import OVModelForCausalLM
+from sentence_transformers import SentenceTransformer
+from sentence_transformers import util
 from transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer

@@ -20,6 +24,44 @@
 from nncf.torch.quantization.layers import SymmetricQuantizer as SQ


+class ValidationMock:
+    def __init__(self) -> None:
+        model_id = "sentence-transformers/all-mpnet-base-v2"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        self.model = SentenceTransformer(
+            model_id, tokenizer_kwargs={"pad_token": self.tokenizer.pad_token}, trust_remote_code=True
+        )
+
+    def calculate_similarity(self, gold: str, prediction: str) -> torch.Tensor:
+        embeddings = self.model.encode([gold, prediction])
+        cos_sim = util.cos_sim(embeddings, embeddings)
+        return torch.mean(cos_sim)
+
+    @property
+    def validation_ref(self) -> torch.Tensor:
+        return torch.tensor(1.0)
+
+
+def generate_control_output(model: AutoModelForCausalLM, tokenizer: AutoTokenizer) -> str:
+    control_input = tokenizer("What is Pytorch?", return_tensors="pt")
+    control_input = control_input.to(model.device)
+    control_output = model.generate(**control_input, do_sample=False)
+    return tokenizer.batch_decode(control_output, skip_special_tokens=True)[0]
+
+
+def get_ov_model(model: AutoModelForCausalLM, tmp_path: str) -> OVModelForCausalLM:
+    model = model.cpu()
+    export_from_model(model, tmp_path)
+    return OVModelForCausalLM.from_pretrained(
+        model_id=tmp_path,
+        trust_remote_code=True,
+        load_in_8bit=False,
+        compile=True,
+        ov_config={"KV_CACHE_PRECISION": "f16", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"},
+    )
+
+
 @pytest.mark.parametrize(
     "compression_kwargs",
     (dict(scale_estimation=True, awq=True), dict(scale_estimation=False, awq=False)),
@@ -33,7 +75,7 @@
     ),
     ids=["asym", "sym"],
 )
-def test_fq_lora_tuning(mode, backup_mode, compression_kwargs, ref_num_trainable, _seed):
+def test_fq_lora_tuning(tmp_path, mode, backup_mode, compression_kwargs, ref_num_trainable, _seed):
     model_id = "facebook/opt-125m"
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=device)
@@ -80,3 +122,21 @@ def test_fq_lora_tuning(mode, backup_mode, compression_kwargs, ref_num_trainable

     assert first_loss > 8
     assert float(loss) < 1
+
+    tuned_output = generate_control_output(model, tokenizer)
+
+    # Workaround until export from optimum is fixed (CVS-164159)
+    model = model.to(torch.float32)
+
+    model = nncf.strip(model)
+    stripped_output = generate_control_output(model, tokenizer)
+
+    model = get_ov_model(model, tmp_path)
+    stripped_ov_output = generate_control_output(model, tokenizer)
+
+    vm = ValidationMock()
+    tuned_vs_stripped = vm.calculate_similarity(tuned_output, stripped_output)
+    tuned_vs_stripped_ov = vm.calculate_similarity(tuned_output, stripped_ov_output)
+
+    assert torch.allclose(tuned_vs_stripped, vm.validation_ref, atol=0.01)
+    assert torch.allclose(tuned_vs_stripped_ov, vm.validation_ref, atol=0.01)
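A note on why comparing the mean against validation_ref = 1.0 works: for two texts, util.cos_sim returns a 2x2 matrix with ones on the diagonal, so its mean is (1 + s) / 2, where s is the gold-vs-prediction similarity. The mean reaches 1.0 exactly when the stripped output embeds identically to the tuned one. A small self-contained check (our illustration, not part of the commit):

import torch
from sentence_transformers import util

emb = torch.nn.functional.normalize(torch.randn(2, 8), dim=1)
sim = util.cos_sim(emb, emb)                      # [[1, s], [s, 1]]
assert torch.allclose(sim.mean(), (1 + sim[0, 1]) / 2)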

tests/torch/quantization/test_strip.py (+39)

@@ -34,6 +34,7 @@
 from tests.common.quantization.data_generators import generate_sweep_data
 from tests.common.quantization.data_generators import get_quant_len_by_range
 from tests.torch.helpers import BasicConvTestModel
+from tests.torch.helpers import LinearModel
 from tests.torch.helpers import create_compressed_model_and_algo_for_test
 from tests.torch.helpers import register_bn_adaptation_init_args
 from tests.torch.quantization.test_functions import get_test_data

@@ -325,3 +326,41 @@ def test_nncf_strip_api(strip_type, do_copy):

     assert isinstance(strip_model.conv.get_pre_op("0").op, FakeQuantize)
     assert isinstance(strip_model.nncf.external_quantizers["/nncf_model_input_0|OUTPUT"], FakeQuantize)
+
+
+@pytest.mark.parametrize(
+    ("mode", "torch_dtype", "atol"),
+    (
+        (nncf.CompressWeightsMode.INT4_ASYM, torch.float32, 0.0005),
+        (nncf.CompressWeightsMode.INT4_ASYM, torch.float16, 0.0005),
+        (nncf.CompressWeightsMode.INT4_ASYM, torch.bfloat16, 0.01),
+        (nncf.CompressWeightsMode.INT4_SYM, torch.float32, 0.0005),
+        (nncf.CompressWeightsMode.INT4_SYM, torch.float16, 0.0005),
+        (nncf.CompressWeightsMode.INT4_SYM, torch.bfloat16, 0.01),
+    ),
+)
+def test_nncf_strip_lora_model(mode, torch_dtype, atol):
+    input_shape = [1, 16]
+    model = LinearModel(input_shape=input_shape)
+    model = model.to(torch_dtype)
+    with torch.no_grad():
+        example = torch.ones(input_shape).to(torch_dtype)
+        dataset = [example]
+
+        compressed_model = nncf.compress_weights(
+            model,
+            ratio=1,
+            group_size=4,
+            mode=mode,
+            backup_mode=None,
+            dataset=nncf.Dataset(dataset),
+            all_layers=True,
+            compression_format=nncf.CompressionFormat.FQ_LORA,
+        )
+
+        compressed_output = compressed_model(example)
+
+        strip_compressed_model = nncf.strip(compressed_model, do_copy=True)
+        stripped_output = strip_compressed_model(example)
+
+        assert torch.allclose(compressed_output, stripped_output, atol=atol)
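A side note on the tolerances in the parametrization (our reading, not stated in the commit): bfloat16 keeps only 7 explicit mantissa bits, so a single round trip through quantize/dequantize loses far more precision than under float16 or float32, which plausibly motivates the looser 0.01 tolerance:

import torch

# One ulp near 1.0 for each dtype:
print(torch.finfo(torch.float16).eps)   # ~0.00098
print(torch.finfo(torch.bfloat16).eps)  # 0.0078125, eight times coarser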

tests/torch/requirements.txt (+4)

@@ -24,3 +24,7 @@ timm==0.9.2
 # Required for torch/fx tests
 torchvision
 fastdownload==0.0.7
+
+sentence-transformers>=2.2.2
+optimum-intel==1.22.0
+optimum==1.24.0
