
Commit d5de30d

Fixed weight compression for float16/bfloat16 Torch models (#3330)
### Changes

Adapted the AWQ and Scale Estimation algorithms for the case when weights and activations are float16 or bfloat16.

### Reason for changes

Otherwise, compression fails with errors like this:

`RuntimeError: expected mat1 and mat2 to have the same dtype, but got: c10::BFloat16 != float`

### Related tickets

n/a

### Tests

- tests/torch/ptq/test_weights_compression.py::test_half_precision_models
- PTWC: https://github.com/openvinotoolkit/nncf/actions/runs/13680175191
- PTWC Performance:

| 51 build on develop | 52 build on PR |
|:-------------------------:|:-------------------------:|
| ![image](https://github.com/user-attachments/assets/3d5b9c96-cf4c-47a2-89e8-f1b6b4f48113) | ![image](https://github.com/user-attachments/assets/0b943e42-f407-4cd5-9e4a-3215561ea9e9) |
1 parent 59c978f commit d5de30d
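For context, the failure described under "Reason for changes" is easy to reproduce in plain PyTorch: AWQ built float32 intermediates, and any matmul mixing them with half-precision tensors fails. A minimal sketch, not NNCF code (shapes are illustrative; the exact message varies slightly across torch versions):

```python
import torch

x = torch.randn(4, 8, dtype=torch.bfloat16)   # half-precision activations
w = torch.randn(16, 8, dtype=torch.float32)   # float32 intermediate tensor

try:
    torch.nn.functional.linear(x, w)          # mat1 (bfloat16) vs mat2 (float32)
except RuntimeError as e:
    print(e)  # expected mat1 and mat2 to have the same dtype, but got: c10::BFloat16 != float
```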

File tree

3 files changed: +26 -3 lines


nncf/quantization/algorithms/weight_compression/awq.py (+7 -3)
@@ -201,6 +201,8 @@ def apply(
             config = wp.compression_config
 
             s, X = process_stats(statistics[k], self._subset_size)
+            s = s.astype(TensorDataType.float32)
+            X = X.astype(TensorDataType.float32)
 
             top_k = max(int(s.shape[0] * self._percent_to_apply), 1)
             topk_idxs = fns.argsort(-s)[:top_k]
@@ -218,6 +220,8 @@ def apply(
             weight = self._backend_entity.get_weight(
                 wp.node_with_weight, weight_port_id, model, graph
             )  # get_const_value(wp.weight_node)
+            weight_dtype = weight.dtype
+            weight = weight.astype(TensorDataType.float32)
             assert isinstance(wp.reduction_axes, tuple) and len(wp.reduction_axes) == 1
             reduction_axis = wp.reduction_axes[0]
 
@@ -279,19 +283,19 @@ def apply(
             w_scale = fns.unsqueeze(w_scale, 0)
             a_scale = fns.unsqueeze(1.0 / a_scale, 1)
 
-            scaled_weight = weight * w_scale
+            scaled_weight = (weight * w_scale).astype(weight_dtype)
             self._backend_entity.set_weight(wp.node_with_weight, weight_port_id, model, graph, scaled_weight)
 
             if self._backend_entity.is_node_with_weights(
                 merge_node, graph
             ):  # for MatMul->Multiply->MatMul pattern scale merged to first MatMul
                 for _, port_id in self._backend_entity.get_weight_names_and_port_ids(merge_node, graph):
                     merge_weight = self._backend_entity.get_weight(merge_node, port_id, model, graph)
-                    merge_weight = merge_weight * a_scale
+                    merge_weight = (merge_weight * a_scale).astype(weight_dtype)
                     self._backend_entity.set_weight(merge_node, port_id, model, graph, merge_weight)
                 a_scale = fns.transpose(a_scale)
             else:  # for Act->Multiply->MatMul and Act->MatMul patterns scale inserted after Act as extra node
-                a_scale = fns.transpose(a_scale)
+                a_scale = fns.transpose(a_scale).astype(weight_dtype)
                 next_nodes = graph.get_next_nodes(merge_node)
                 source_node_output_port = graph.get_output_edges(merge_node)[0].output_port_id
                 scale_insertion_command = self._backend_entity.scale_insertion_command(
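Both hunks follow the same upcast-compute-downcast discipline: statistics and weights are promoted to float32 before the scale arithmetic, and every tensor written back into the model is cast to the original weight dtype so downstream matmuls see consistent types. A standalone sketch of that pattern in plain PyTorch (the function name is hypothetical, not NNCF's Tensor API):

```python
import torch

def apply_scale_in_fp32(weight: torch.Tensor, w_scale: torch.Tensor) -> torch.Tensor:
    """Scale a (possibly half-precision) weight in float32, then restore its dtype."""
    original_dtype = weight.dtype                  # remember float16/bfloat16
    scaled = weight.to(torch.float32) * w_scale    # numerically sensitive math in float32
    return scaled.to(original_dtype)               # cast back before writing into the model

w = torch.randn(16, 8, dtype=torch.bfloat16)
scale = torch.rand(1, 8, dtype=torch.float32)
assert apply_scale_in_fp32(w, scale).dtype == torch.bfloat16
```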

nncf/quantization/algorithms/weight_compression/scale_estimation.py (+1)
@@ -185,6 +185,7 @@ def calculate_quantization_params(
 
         s, X = process_stats(statistics, subset_size)
 
+        X = X.astype(TensorDataType.float32)
         weight = weight.astype(TensorDataType.float32)
         eps = fns.finfo(weight).eps
 
tests/torch/ptq/test_weights_compression.py (+18)
@@ -15,6 +15,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from transformers import AutoModelForCausalLM
+from transformers import AutoTokenizer
 
 import nncf
 from nncf import BackupMode
@@ -436,6 +438,22 @@ def test_pack_int4():
     assert torch.all(unpacked_w == w_int8)
 
 
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+def test_half_precision_models(dtype):
+    model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype)
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    inputs = tokenizer("dummy_input", return_tensors="pt")
+    compress_weights(
+        model,
+        group_size=2,
+        mode=CompressWeightsMode.INT4_SYM,
+        scale_estimation=True,
+        awq=True,
+        dataset=nncf.Dataset([dict(inputs)]),
+    )
+
+
 class TestPTTemplateWeightCompression(TemplateWeightCompression):
     @staticmethod
     def get_matmul_model() -> torch.nn.Module:
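With the fix in place, the new test can be run on its own, e.g. `pytest tests/torch/ptq/test_weights_compression.py -k test_half_precision_models` (assuming `transformers` is installed); it exercises AWQ and scale estimation end-to-end on a tiny OPT model in both bfloat16 and float16.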
