Use regular division inside Scale Estimation #3210

Merged
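This PR removes the temporary `dynamic_shapes` / `convertable_division` kwargs that Scale Estimation threaded into the OpenVINO-optimized quantization functions, so the generated OV models now divide by the scale directly instead of using the "convertable" mode that may be lowered to a reciprocal multiply. A minimal numpy sketch (illustrative only, not NNCF code) of why the two modes are not bit-identical, which is what nudges the reference metrics below:

```python
import numpy as np

# Illustrative only: a "convertable" division may be lowered to a multiply by
# the reciprocal, while regular division divides directly. In float32 the two
# can differ by a few ULPs per element, enough to shift an end-to-end accuracy
# metric slightly (0.81389 -> 0.80873 in the reference data below).
rng = np.random.default_rng(0)
w = rng.standard_normal((4, 64)).astype(np.float32)    # weight block
s = rng.uniform(0.01, 1.0, (4, 1)).astype(np.float32)  # per-channel scale

regular = w / s                        # regular division (after this PR)
convertable = w * (np.float32(1) / s)  # reciprocal-multiply lowering

print(np.max(np.abs(regular - convertable)))  # small but typically non-zero
```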
12 changes: 2 additions & 10 deletions nncf/openvino/optimized_functions/functions.py
```diff
@@ -32,7 +32,6 @@ def do_int_quantization(
     reduction_axes: Optional[ReductionAxes] = None,
     precomputed_scale: Tensor = None,
     precomputed_zero_point: Tensor = None,
-    **kwargs,
 ) -> Tuple[Tensor, Tensor, Tensor]:
     """
     Quantizes the given weight tensor.
@@ -49,10 +48,7 @@ def do_int_quantization(
     scale_shape = None if precomputed_scale is None else precomputed_scale.shape
     zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape
 
-    ov_model_params = OVModelParameters(
-        dynamic_shapes=kwargs.get("dynamic_shapes") is True,
-        convertable_division=kwargs.get("convertable_division") is True,
-    )
+    ov_model_params = OVModelParameters()
     ov_model_params.input_dtypes["weight"] = weight.dtype
     if precomputed_scale is not None:
         ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype
@@ -107,7 +103,6 @@ def quantize_dequantize_weight(
     precomputed_scale: Optional[Tensor] = None,
     precomputed_zero_point: Optional[Tensor] = None,
     return_compressed_weight: Optional[bool] = False,
-    **kwargs,
 ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]:
     """
     Quantizes the given weight tensor and then dequantizes it back to obtain float32 values.
@@ -132,10 +127,7 @@ def quantize_dequantize_weight(
     scale_shape = precomputed_scale.shape if precomputed_scale is not None else None
     zero_point_shape = precomputed_zero_point.shape if precomputed_zero_point is not None else None
 
-    ov_model_params = OVModelParameters(
-        dynamic_shapes=kwargs.get("dynamic_shapes") is True,
-        convertable_division=kwargs.get("convertable_division") is True,
-    )
+    ov_model_params = OVModelParameters()
     ov_model_params.input_dtypes["weight"] = weight.dtype
     if precomputed_scale is not None:
         ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype
```
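For orientation, a runnable numpy-only sketch of the contract these helpers implement (illustrative; the real `do_int_quantization` operates on NNCF tensors and also accepts precomputed scales and zero points, per the signature above):

```python
import numpy as np

def int4_sym_quantize(weight: np.ndarray, reduction_axes=(1,)):
    """Illustrative INT4 symmetric quantization, not the NNCF implementation:
    returns (compressed_weight, scale, zero_point) like do_int_quantization."""
    level_low, level_high = -8, 7  # signed INT4 range
    scale = np.max(np.abs(weight), axis=reduction_axes, keepdims=True) / level_high
    compressed = np.clip(np.round(weight / scale), level_low, level_high)
    zero_point = np.zeros_like(scale)  # symmetric scheme: zero point is zero
    return compressed.astype(np.int8), scale, zero_point

w = np.random.default_rng(0).standard_normal((4, 64)).astype(np.float32)
q, scale, zp = int4_sym_quantize(w)
print(q.min(), q.max(), scale.shape)  # codes in [-8, 7], per-channel scale
```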
8 changes: 0 additions & 8 deletions nncf/quantization/algorithms/weight_compression/scale_estimation.py
```diff
@@ -245,10 +245,6 @@ def calculate_quantization_params(
     zero_scale = 0.001
     zero_mask = zero_scale * zero_mask.astype(original_weight.dtype)
 
-    # This is required for alignment with a previous OpenVINO models implementation
-    # TODO(Nikita Savelyev): remove this
-    opt_fns_kwargs = dict(dynamic_shapes=False, convertable_division=True)
-
     # iterative rectification of initial scale
     for i in range(initial_steps):
         near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance)
@@ -263,7 +259,6 @@ def calculate_quantization_params(
             config,
             precomputed_scale=near_to_ideal_scale,
             precomputed_zero_point=zp,
-            **opt_fns_kwargs,
         )
 
         q_weights_ = fns.zeros_like(original_weight) + out
@@ -298,7 +293,6 @@ def calculate_quantization_params(
             config,
             precomputed_scale=near_to_ideal_scale,
             precomputed_zero_point=zp,
-            **opt_fns_kwargs,
         )
         compressed_weights = fns.zeros_like(original_weight) + out
         target, zero_mask = get_target_zero_mask(compressed_weights, zp)
@@ -317,7 +311,6 @@ def calculate_quantization_params(
             config,
             precomputed_scale=scaled_scale,
             precomputed_zero_point=zp,
-            **opt_fns_kwargs,
         )
         compressed_weights = fns.zeros_like(original_weight) + out
 
@@ -335,7 +328,6 @@ def calculate_quantization_params(
             config,
             precomputed_scale=near_to_ideal_scale,
             precomputed_zero_point=zp,
-            **opt_fns_kwargs,
         )
         q_weights_ = fns.zeros_like(original_weight) + out
```
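The hunks above all sit inside the iterative scale-rectification loop of `calculate_quantization_params`. A simplified, runnable numpy sketch of that loop's shape, with `estimate_scales_np` as a stand-in for NNCF's `estimate_scales` (the real helper additionally handles group-wise shapes, zero points, and activation-derived importance):

```python
import numpy as np

def estimate_scales_np(w, q, importance):
    """Stand-in for estimate_scales: the importance-weighted least-squares
    scale s minimizing sum(importance * (w - s * q) ** 2) per channel."""
    num = np.sum(importance * w * q, axis=-1, keepdims=True)
    den = np.sum(importance * q * q, axis=-1, keepdims=True)
    return num / np.maximum(den, 1e-12)

rng = np.random.default_rng(0)
w = rng.standard_normal((4, 64)).astype(np.float32)
importance = np.ones_like(w)
scale = np.max(np.abs(w), axis=-1, keepdims=True) / 7  # initial INT4 scale

# Iterative rectification of the initial scale, mirroring the loop above:
# re-quantize with the current scale, then refit the scale to the codes.
for _ in range(5):
    q = np.clip(np.round(w / scale), -8, 7)  # regular division, per this PR
    scale = estimate_scales_np(w, q, importance)

print(float(np.mean((w - scale * q) ** 2)))  # reconstruction error after refits
```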
7 changes: 1 addition & 6 deletions nncf/quantization/algorithms/weight_compression/weight_lowering.py
```diff
@@ -430,7 +430,6 @@ def do_int_quantization(
     reduction_axes: Optional[ReductionAxes] = None,
     precomputed_scale: Tensor = None,
     precomputed_zero_point: Tensor = None,
-    **kwargs,
 ) -> Tuple[Tensor, Tensor, Tensor]:
     """
     Performs integer quantization on the given weight tensor.
@@ -462,9 +461,7 @@ def do_int_quantization(
     if is_openvino_available() and weight.backend in [TensorBackend.ov, TensorBackend.numpy]:
         from nncf.openvino.optimized_functions import do_int_quantization as do_int_quantization_ov
 
-        return do_int_quantization_ov(
-            weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, **kwargs
-        )
+        return do_int_quantization_ov(weight, config, reduction_axes, precomputed_scale, precomputed_zero_point)
     if not is_openvino_available() and weight.backend in [TensorBackend.ov, TensorBackend.numpy]:
         nncf_logger.info_once(
             "OpenVINO optimizations are disabled. Install OpenVINO to enable them and improve the performance."
@@ -498,7 +495,6 @@ def quantize_dequantize_weight(
     precomputed_scale: Optional[Tensor] = None,
     precomputed_zero_point: Optional[Tensor] = None,
     return_compressed_weight: Optional[bool] = False,
-    **kwargs,
 ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]:
     """
     First quantizes the given weight tensor and then dequantizes it back to obtain float32 values.
@@ -524,7 +520,6 @@ def quantize_dequantize_weight(
             precomputed_scale,
             precomputed_zero_point,
             return_compressed_weight,
-            **kwargs,
         )
     if not is_openvino_available() and weight.backend in [TensorBackend.ov, TensorBackend.numpy]:
         nncf_logger.info_once(
```
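Similarly, a runnable numpy sketch of the quantize-then-dequantize round trip this entry point performs (illustrative, not the NNCF implementation; the real function dispatches to the OV-optimized path shown above when available):

```python
import numpy as np

def quantize_dequantize(weight, reduction_axes=(1,), return_compressed_weight=False):
    """Illustrative INT4 round trip: mirrors the
    Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]] return shape above."""
    scale = np.max(np.abs(weight), axis=reduction_axes, keepdims=True) / 7
    compressed = np.clip(np.round(weight / scale), -8, 7)  # regular division
    qdq = compressed * scale  # back to the float values the caller consumes
    if return_compressed_weight:
        return qdq, compressed.astype(np.int8), scale, np.zeros_like(scale)
    return qdq

w = np.random.default_rng(1).standard_normal((4, 64)).astype(np.float32)
qdq, q, scale, zp = quantize_dequantize(w, return_compressed_weight=True)
print(float(np.max(np.abs(w - qdq))))  # worst-case round-trip error
```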
4 changes: 2 additions & 2 deletions tests/post_training/data/wc_reference_data.yaml
```diff
@@ -32,11 +32,11 @@ tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV:
   num_int8: 124
   metrics_xfail_reason: "Issue-148819"
 tinyllama_scale_estimation_per_channel_backend_OV:
-  metric_value: 0.81389
+  metric_value: 0.80873
   num_int4: 188
   num_int8: 124
 tinyllama_scale_estimation_per_channel_backend_TORCH:
-  metric_value: 0.81389
+  metric_value: 0.80873
   num_int4: 188
   num_int8: 124
   atol: 0.006 # difference across devices: 0.80873 vs 0.81389
```
Comment on lines 42 to 46 (Collaborator, Author):
The reference value for PT is changed to stay aligned with OV. Interestingly, the new value for OV (0.80873) is also the value that the PT backend gets on some devices.

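For reference, 0.81389 - 0.80873 = 0.00516, so the `atol: 0.006` in these entries already absorbs the cross-device spread described above.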
2 changes: 1 addition & 1 deletion tests/post_training/data/wc_reference_data_2025.0.yaml
```diff
@@ -19,7 +19,7 @@ tinyllama_scale_estimation_group_size_64_backend_TORCH:
   num_int8: 124
   num_compressed_xfail_reason: "Issue-160006"
 tinyllama_scale_estimation_per_channel_backend_TORCH:
-  metric_value: 0.81389
+  metric_value: 0.80873
   num_int4: 188
   num_int8: 124
   atol: 0.006 # difference across devices: 0.80873 vs 0.81389
```