Commit 73590b0 · authored Mar 10, 2025
Use regular division inside Scale Estimation (#3210)
### Changes

Always compute division inside the SE algorithm as `a/b`; previously it was computed as `a*(1/b)` in some cases.

### Reason for changes

During the implementation of #2727, some choices were made about how the division operation is computed so that the changes stayed fully aligned with the previous implementation. Namely, before #2727 some divisions were computed as `a*(1/b)`, and this remained the case afterwards. Computing these divisions that way was never intended. Now all divisions are aligned to the `a/b` form.

Compression time and memory are roughly the same:

| Model        | Compression | Compression Time Develop (sec.) | Compression Time Branch (sec.) | Peak Memory Develop (MiB) | Peak Memory Branch (MiB) |
|--------------|-------------|---------------------------------|--------------------------------|---------------------------|--------------------------|
| tiny-llama   | int4, SE    | 222*                            | 228*                           | 3030                      | 3032                     |
| phi4-mini    | int4, SE    | 789*                            | 790*                           | 10817                     | 10768                    |
| llama-3.1-8b | int4, SE    | 1776*                           | 1801*                          | 17756                     | 18224                    |

\* The time columns include PT -> OV conversion time.

### Related tickets

163286

### Tests

- https://github.com/openvinotoolkit/nncf/actions/runs/13368886294
- NNCF/job/manual/job/post_training_weight_compression/324/
- OVVP validation ✅
1 parent 64d8468 · commit 73590b0
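For context on why the reference metrics shift below: `a/b` and `a*(1/b)` are not bit-identical in floating point, because `1/b` is rounded before the multiply, so the result can land a ulp or so away from the directly rounded quotient. A minimal NumPy sketch (illustration only, not part of this commit) showing how often the two forms disagree on random float32 inputs:

```python
import numpy as np

rng = np.random.default_rng(0)
a = rng.random(1_000_000, dtype=np.float32)
b = rng.random(1_000_000, dtype=np.float32) + np.float32(0.5)  # keep b away from zero

direct = a / b                            # quotient rounded once
via_reciprocal = a * (np.float32(1) / b)  # 1/b rounded, then the product rounded again

mismatches = np.count_nonzero(direct != via_reciprocal)
print(f"{mismatches} of {a.size} quotients differ between a/b and a*(1/b)")
```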

5 files changed (+6 −27 lines)

nncf/openvino/optimized_functions/functions.py (+2 −10)

```diff
@@ -33,7 +33,6 @@ def do_int_quantization(
     reduction_axes: Optional[ReductionAxes] = None,
     precomputed_scale: Tensor = None,
     precomputed_zero_point: Tensor = None,
-    **kwargs,
 ) -> Tuple[Tensor, Tensor, Tensor]:
     """
     Quantizes the given weight tensor.
@@ -50,10 +49,7 @@ def do_int_quantization(
     scale_shape = None if precomputed_scale is None else precomputed_scale.shape
     zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape
 
-    ov_model_params = OVModelParameters(
-        dynamic_shapes=kwargs.get("dynamic_shapes") is True,
-        convertable_division=kwargs.get("convertable_division") is True,
-    )
+    ov_model_params = OVModelParameters()
     ov_model_params.input_dtypes["weight"] = weight.dtype
     if precomputed_scale is not None:
         ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype
@@ -108,7 +104,6 @@ def quantize_dequantize_weight(
     precomputed_scale: Optional[Tensor] = None,
     precomputed_zero_point: Optional[Tensor] = None,
     return_compressed_weight: Optional[bool] = False,
-    **kwargs,
 ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]:
     """
     Quantizes the given weight tensor and then dequantizes it back to obtain float32 values.
@@ -133,10 +128,7 @@ def quantize_dequantize_weight(
     scale_shape = precomputed_scale.shape if precomputed_scale is not None else None
     zero_point_shape = precomputed_zero_point.shape if precomputed_zero_point is not None else None
 
-    ov_model_params = OVModelParameters(
-        dynamic_shapes=kwargs.get("dynamic_shapes") is True,
-        convertable_division=kwargs.get("convertable_division") is True,
-    )
+    ov_model_params = OVModelParameters()
     ov_model_params.input_dtypes["weight"] = weight.dtype
     if precomputed_scale is not None:
         ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype
```
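The removed `convertable_division` flag selected how division was lowered into the generated OpenVINO subgraph. As a rough illustration (a hypothetical sketch, not NNCF's actual model-building code), such a switch boils down to emitting either a single `Divide` node or a `Multiply(a, Power(b, -1))` pair:

```python
import numpy as np
import openvino as ov
import openvino.runtime.opset13 as opset

def build_division_model(convertable_division: bool) -> ov.Model:
    # Hypothetical sketch of a convertable_division-style switch.
    a = opset.parameter([-1], dtype=np.float32, name="a")
    b = opset.parameter([-1], dtype=np.float32, name="b")
    if convertable_division:
        # a * (1 / b): the form SE passed down for alignment before this commit
        minus_one = opset.constant(-1.0, dtype=np.float32)
        result = opset.multiply(a, opset.power(b, minus_one))
    else:
        # plain a / b: the form this commit makes the only one
        result = opset.divide(a, b)
    return ov.Model([result], [a, b], name="division")
```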

nncf/quantization/algorithms/weight_compression/scale_estimation.py (−8)

```diff
@@ -246,10 +246,6 @@ def calculate_quantization_params(
         zero_scale = 0.001
         zero_mask = zero_scale * zero_mask.astype(original_weight.dtype)
 
-        # This is required for alignment with a previous OpenVINO models implementation
-        # TODO(Nikita Savelyev): remove this
-        opt_fns_kwargs = dict(dynamic_shapes=False, convertable_division=True)
-
         # iterative rectification of initial scale
         for i in range(initial_steps):
             near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance)
@@ -264,7 +260,6 @@ def calculate_quantization_params(
                 config,
                 precomputed_scale=near_to_ideal_scale,
                 precomputed_zero_point=zp,
-                **opt_fns_kwargs,
             )
 
             q_weights_ = fns.zeros_like(original_weight) + out
@@ -299,7 +294,6 @@ def calculate_quantization_params(
                 config,
                 precomputed_scale=near_to_ideal_scale,
                 precomputed_zero_point=zp,
-                **opt_fns_kwargs,
             )
             compressed_weights = fns.zeros_like(original_weight) + out
             target, zero_mask = get_target_zero_mask(compressed_weights, zp)
@@ -318,7 +312,6 @@ def calculate_quantization_params(
                 config,
                 precomputed_scale=scaled_scale,
                 precomputed_zero_point=zp,
-                **opt_fns_kwargs,
             )
             compressed_weights = fns.zeros_like(original_weight) + out
 
@@ -336,7 +329,6 @@ def calculate_quantization_params(
                 config,
                 precomputed_scale=near_to_ideal_scale,
                 precomputed_zero_point=zp,
-                **opt_fns_kwargs,
             )
             q_weights_ = fns.zeros_like(original_weight) + out
 
```
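For orientation only: the hunks above sit inside SE's iterative scale rectification, where `estimate_scales` recomputes a per-group scale on each step. A generic weighted least-squares version of such an estimate (a hypothetical NumPy sketch, not NNCF's actual `estimate_scales`) looks like:

```python
import numpy as np

def weighted_ls_scale(weight: np.ndarray, target: np.ndarray, importance: np.ndarray) -> np.ndarray:
    # Scale s minimizing sum(importance * (weight - s * target) ** 2) per group:
    # setting the derivative to zero gives s = sum(imp*w*t) / sum(imp*t**2).
    numerator = np.sum(importance * weight * target, axis=-1, keepdims=True)
    denominator = np.sum(importance * target**2, axis=-1, keepdims=True)
    return numerator / denominator  # computed as plain a/b, in line with this commit
```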

nncf/quantization/algorithms/weight_compression/weight_lowering.py (+1 −6)

```diff
@@ -443,7 +443,6 @@ def do_int_quantization(
     reduction_axes: Optional[ReductionAxes] = None,
     precomputed_scale: Tensor = None,
     precomputed_zero_point: Tensor = None,
-    **kwargs,
 ) -> Tuple[Tensor, Tensor, Tensor]:
     """
     Performs integer quantization on the given weight tensor.
@@ -475,9 +474,7 @@ def do_int_quantization(
     if _can_run_optimized(weight.backend):
         from nncf.openvino.optimized_functions import do_int_quantization as do_int_quantization_ov
 
-        return do_int_quantization_ov(
-            weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, **kwargs
-        )
+        return do_int_quantization_ov(weight, config, reduction_axes, precomputed_scale, precomputed_zero_point)
 
     # Reference implementation
     if weight.backend == TensorBackend.ov:
@@ -507,7 +504,6 @@ def quantize_dequantize_weight(
     precomputed_scale: Optional[Tensor] = None,
     precomputed_zero_point: Optional[Tensor] = None,
     return_compressed_weight: Optional[bool] = False,
-    **kwargs,
 ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]:
     """
     First quantizes the given weight tensor and then dequantizes it back to obtain float32 values.
@@ -533,7 +529,6 @@ def quantize_dequantize_weight(
             precomputed_scale,
             precomputed_zero_point,
             return_compressed_weight,
-            **kwargs,
         )
 
     # Reference implementation
```

tests/post_training/data/wc_reference_data.yaml (+2 −2)

```diff
@@ -36,11 +36,11 @@ tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV:
   num_int8: 124
   metrics_xfail_reason: "Issue-148819"
 tinyllama_scale_estimation_per_channel_backend_OV:
-  metric_value: 0.81389
+  metric_value: 0.80873
   num_int4: 188
   num_int8: 124
 tinyllama_scale_estimation_per_channel_backend_TORCH:
-  metric_value: 0.81389
+  metric_value: 0.80873
   num_int4: 188
   num_int8: 124
   atol: 0.006 # difference across devices: 0.80873 vs 0.81389
```
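Sanity check on the tolerance: the two device results differ by |0.81389 − 0.80873| = 0.00516, which the `atol: 0.006` absorbs.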

tests/post_training/data/wc_reference_data_2025.0.yaml (+1 −1)

```diff
@@ -14,7 +14,7 @@ tinyllama_data_aware_backend_TORCH:
   num_int8: 124
   num_compressed_xfail_reason: "Issue-160006"
 tinyllama_scale_estimation_per_channel_backend_TORCH:
-  metric_value: 0.81389
+  metric_value: 0.80873
   num_int4: 188
   num_int8: 124
   atol: 0.006 # difference across devices: 0.80873 vs 0.81389
```