Commit 7135bbb

OV data-free mixed precision assignment running time improvement (#3292)
### Changes

After the implementation of openvino optimized functions in #2727, the int4 compression part of the compression pipeline is now faster than the mixed precision search part. In this PR, mixed precision assignment runtime is improved using the same approach: the function `weight_lowering.get_integer_quantization_error()` is optimized. Below are before/after results for several BF16 models:

| Model | Compression | Compression Time Develop (sec.) | Compression Time Branch (sec.) | Peak Memory Develop (MiB) | Peak Memory Branch (MiB) |
|--------------|----------------------|-------|--------------|------|-------------|
| tiny-llama   | int4_asym, ratio=0.8 | 22.94 | 17.03 (-26%) | 1406 | 1085 (-23%) |
| phi3-mini    | int4_asym, ratio=0.8 | 56.40 | 32.06 (-43%) | 3669 | 2754 (-25%) |
| llama-3.1-8b | int4_asym, ratio=0.8 | 97.27 | 56.80 (-42%) | 6660 | 5505 (-17%) |

### Related tickets

161921

### Tests

Extended:
- tests/openvino/optimized_functions/test_compression_functions.py
- tests/openvino/optimized_functions/test_ov_model_parameters.py

https://github.com/openvinotoolkit/nncf/actions/runs/13412231315
1 parent bf9fe48 commit 7135bbb

File tree

13 files changed: +282 -63 lines


nncf/openvino/graph/node_utils.py

+16 -7

@@ -110,21 +110,30 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int:
     return cnt_if_op(model, 0)
 
 
-def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: bool = True) -> np.ndarray:
+def get_const_value_as_numpy_tensor(const_node: ov.Node) -> np.ndarray:
     """
-    Returns the constant tensor for the node.
+    Returns the constant tensor for the node as an instance of np.ndarray. BF16 constants will be converted to FP32.
     This method is applicable only for the floating-point constant data.
 
     :param const_node: OpenVINO node.
-    :param cast_bf16_to_fp32: Whether to cast bf16 node data to fp32 or not. If False and the node contains bf16 data,
-        the resulting bf16 value will be returned encoded inside a numpy.float16 array.
     :return: The constant value.
     """
-    if const_node.get_element_type() == ov.Type.bf16 and cast_bf16_to_fp32:
+    if const_node.get_element_type() == ov.Type.bf16:
         return const_node.get_data(dtype=np.float32)
     return const_node.data
 
 
+def get_const_value_as_ov_tensor(const_node: ov.Node) -> ov.Tensor:
+    """
+    Returns the constant tensor for the node as an instance of openvino.Tensor which is useful when BF16 constant
+    needs to be retrieved as is.
+
+    :param const_node: OpenVINO node.
+    :return: The constant value as openvino.Tensor.
+    """
+    return ov.Tensor(const_node.data, const_node.get_output_shape(0), const_node.get_element_type())
+
+
 def get_bias_value(
     node_with_bias: NNCFNode, nncf_graph: NNCFGraph, model: ov.Model, node_mapping: Dict[str, ov.Node] = None
 ) -> np.ndarray:

@@ -141,7 +150,7 @@ def get_bias_value(
     node_mapping = {op.get_friendly_name(): op for op in model.get_ops()}
     bias_constant = get_node_with_bias_value(get_add_bias_node(node_with_bias, nncf_graph), nncf_graph)
     ov_bias_constant = node_mapping[bias_constant.node_name]
-    return get_const_value(ov_bias_constant)
+    return get_const_value_as_numpy_tensor(ov_bias_constant)
 
 
 def get_weight_value(node_with_weight: NNCFNode, model: ov.Model, port_id: int) -> np.ndarray:

@@ -157,7 +166,7 @@ def get_weight_value(node_with_weight: NNCFNode, model: ov.Model, port_id: int)
     const_op_friendly_name = node_with_weight.layer_attributes.constant_attributes[port_id]["name"]
     friendly_name_to_op_map = {op.get_friendly_name(): op for op in model.get_ops()}
     const_op = friendly_name_to_op_map[const_op_friendly_name]
-    weight_tensor = get_const_value(const_op)
+    weight_tensor = get_const_value_as_numpy_tensor(const_op)
     return weight_tensor
 
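A minimal usage sketch of the two helpers introduced above. The constant construction is illustrative only (NNCF looks these nodes up in a loaded model), and the opset import style is an assumption; the point is that one helper returns a NumPy view (BF16 upcast to FP32) while the other keeps the original element type inside an `openvino.Tensor`.

```python
import numpy as np
import openvino as ov
from openvino.runtime import opset13 as opset

from nncf.openvino.graph.node_utils import get_const_value_as_numpy_tensor
from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor

# Illustrative constant node; in NNCF these nodes are looked up from a real model.
const_node = opset.constant(np.ones((4, 4), dtype=np.float32))

# np.ndarray view of the constant; BF16 constants would be upcast to float32 here.
np_weight = get_const_value_as_numpy_tensor(const_node)

# openvino.Tensor view that keeps the constant's original element type (useful for BF16).
ov_weight = get_const_value_as_ov_tensor(const_node)

print(np_weight.dtype, ov_weight.element_type)
```
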
nncf/openvino/optimized_functions/__init__.py

+1

@@ -11,6 +11,7 @@
 
 from nncf.openvino.optimized_functions.functions import astype as astype
 from nncf.openvino.optimized_functions.functions import do_int_quantization as do_int_quantization
+from nncf.openvino.optimized_functions.functions import get_integer_quantization_error as get_integer_quantization_error
 from nncf.openvino.optimized_functions.functions import quantize_dequantize_weight as quantize_dequantize_weight
 from nncf.openvino.optimized_functions.models import OVModelParameters as OVModelParameters
 from nncf.openvino.optimized_functions.models import clear_ov_model_cache as clear_ov_model_cache

nncf/openvino/optimized_functions/functions.py

+37

@@ -17,6 +17,7 @@
 from nncf.openvino.optimized_functions.models import get_astype_model
 from nncf.openvino.optimized_functions.models import get_compress_decompress_weight_model
 from nncf.openvino.optimized_functions.models import get_compress_weight_model
+from nncf.openvino.optimized_functions.models import get_quantization_error_model
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization
 from nncf.tensor import Tensor

@@ -168,6 +169,42 @@ def quantize_dequantize_weight(
     return decompressed_weight
 
 
+def get_integer_quantization_error(
+    weight: Tensor,
+    reduction_axes: ReductionAxes,
+    config: WeightCompressionConfig,
+) -> float:
+    """
+    Calculates a quantity characterizing the difference between floating point weights and fake quantized
+    (compressed and decompressed) to integer ones.
+
+    The error is computed as follows:
+    error = max(mean((decompressed_weight - weight)^2, axis=reduction_axes))
+
+    :param weight: Weight array to compress.
+    :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max).
+    :param config: Information on how to compress (quantize) a specific weight.
+    :return: The quantity characterizing the error of integer quantization.
+    """
+    original_weight_shape = weight.shape
+    original_reduction_axes = reduction_axes
+
+    # When reduction axes are not provided, assuming that the weights are already reshaped
+    if config.group_size != -1 and reduction_axes is not None:
+        # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2]
+        weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)
+
+    ov_model_params = OVModelParameters()
+    ov_model_params.input_dtypes["weight"] = weight.dtype
+    model = get_quantization_error_model(
+        ov_model_params, config, original_weight_shape, weight.shape, original_reduction_axes, reduction_axes
+    )
+
+    quantization_error = model([weight])[0].item()
+
+    return quantization_error
+
+
 def astype(a: Tensor, dtype: TensorDataType) -> Tensor:
     """
     Converts the given tensor to the specified data type. Allows to convert between u4, i4, bf16 data types which are
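A sketch of how the new optimized entry point could be called directly. The weight shape, mode, and group size below are made up for illustration; in the actual pipeline the call is routed through `weight_lowering.get_integer_quantization_error()` and the backend dispatch shown in the mixed precision changes further down.

```python
import numpy as np

from nncf.openvino.optimized_functions import get_integer_quantization_error
from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import Tensor

# Illustrative FP32 weight; in the mixed precision search this comes from the model.
weight = Tensor(np.random.rand(128, 256).astype(np.float32))
config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_ASYM, group_size=64)

# Max over output channels of the mean squared error between the original weight
# and its fake-quantized (compressed + decompressed) counterpart.
error = get_integer_quantization_error(weight, reduction_axes=(1,), config=config)
print(error)
```
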

nncf/openvino/optimized_functions/models.py

+72 -7

@@ -203,7 +203,6 @@ def get_compress_weight_model(
     scale_shape: Optional[Tuple] = None,
     zero_point_shape: Optional[Tuple] = None,
     reduction_axes: Optional[ReductionAxes] = None,
-    return_nodes: Optional[bool] = False,
 ) -> Union[ModelCallable, ModelAsNodes]:
     """
     Get a model that compresses weights using the given configuration.

@@ -217,8 +216,6 @@
         as an input.
     :param reduction_axes: Optional axes to reduce the weight tensor. Not needed if scale (and z.p.) are provided as
         inputs.
-    :param return_nodes: Whether to return the OV model inputs parameters and results nodes instead of the model
-        callable.
     :return: A model callable that compresses weights using the given configuration. Or a model as nodes, if
         `return_nodes` is True.
     """

@@ -233,7 +230,6 @@
         scale_shape,
         zero_point_shape,
         reduction_axes,
-        return_nodes=return_nodes,
     )
 
 

@@ -278,6 +274,35 @@ def get_compress_decompress_weight_model(
     )
 
 
+def get_quantization_error_model(
+    ov_model_params: OVModelParameters,
+    config: WeightCompressionConfig,
+    original_weight_shape: Tuple,
+    weight_shape: Tuple,
+    original_reduction_axes: ReductionAxes,
+    reduction_axes: ReductionAxes,
+) -> ModelCallable:
+    """
+    Get a model that calculates the quantization error for a given weight.
+
+    This function builds a model that compresses and then decompresses the given weight, and calculates the
+    quantization error by comparing the original weight with the decompressed weight.
+
+    :param ov_model_params: OV model parameters.
+    :param config: Compression configuration.
+    :param original_weight_shape: Shape of the original weight tensor.
+    :param weight_shape: Shape of the weight tensor to be compressed.
+    :param original_reduction_axes: Reduction axes of the original weight tensor before reshaping.
+    :param reduction_axes: Axes to reduce the weight tensor.
+    :return: A model callable that returns the quantization error.
+    """
+    weight_shape, _, _ = _prepare_compression_model_inputs(ov_model_params, weight_shape, None, None, reduction_axes)
+
+    return _build_quantization_error_model(
+        config, ov_model_params, original_weight_shape, weight_shape, original_reduction_axes, reduction_axes
+    )
+
+
 @cache_results(OV_MODEL_CACHE)
 def _build_compress_model(
     config: WeightCompressionConfig,

@@ -437,7 +462,8 @@ def _build_compress_decompress_model(
     zero_point_shape: Optional[Tuple] = None,
     reduction_axes: Optional[ReductionAxes] = None,
     return_compressed_weight: Optional[bool] = False,
-) -> ModelCallable:
+    return_nodes: Optional[bool] = False,
+) -> Union[ModelCallable, ModelAsNodes]:
     default_output_dtypes = {"decompressed_weight": TensorDataType.float32}
     if not return_compressed_weight:
         # If compressed weight is not returned to a user, we can keep it in float32 to avoid additional conversion

@@ -451,8 +477,8 @@
         raise ValueError(msg)
 
     # Get compression model as input/result nodes and potentially modified ov model parameters
-    ov_parameters, ov_results, ov_model_params = get_compress_weight_model(
-        ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True
+    ov_parameters, ov_results, ov_model_params = _build_compress_model(
+        config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True
     )
 
     if config.is_asym_mode:

@@ -477,12 +503,51 @@
         decompressed_weight = opset.multiply(scale, convert_op(compressed_weight, ov.Type.f32))
 
     ov_results = [decompressed_weight] + ov_results if return_compressed_weight else [decompressed_weight]
+
+    if return_nodes:
+        return ov_parameters, ov_results, ov_model_params
+
     model = ov.Model(ov_results, ov_parameters)
     compiled_model = _compile_ov_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32})
 
     return partial(_infer_ov_model, ov_model_params, compiled_model)
 
 
+@cache_results(OV_MODEL_CACHE)
+def _build_quantization_error_model(
+    config: WeightCompressionConfig,
+    ov_model_params: OVModelParameters,
+    original_weight_shape: Tuple,
+    weight_shape: Tuple,
+    original_reduction_axes: ReductionAxes,
+    reduction_axes: ReductionAxes,
+) -> ModelCallable:
+    ov_parameters, ov_results, ov_model_params = _build_compress_decompress_model(
+        config,
+        ov_model_params,
+        weight_shape,
+        reduction_axes=reduction_axes,
+        return_compressed_weight=False,
+        return_nodes=True,
+    )
+
+    weight = ov_parameters[0]
+    decompressed_weight = ov_results[0]
+
+    weight = convert_op(opset.reshape(weight, original_weight_shape, special_zero=False), ov.Type.f32)
+    decompressed_weight = convert_op(
+        opset.reshape(decompressed_weight, original_weight_shape, special_zero=False), ov.Type.f32
+    )
+    diff = opset.squared_difference(decompressed_weight, weight)
+    layer_err = opset.reduce_mean(diff, reduction_axes=original_reduction_axes)
+    quantization_error = opset.reduce_max(layer_err, reduction_axes=tuple(range(len(layer_err.shape))))
+
+    model = ov.Model([quantization_error], ov_parameters)
+    compiled_model = _compile_ov_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32})
+
+    return partial(_infer_ov_model, ov_model_params, compiled_model)
+
+
 def get_astype_model(ov_model_params: OVModelParameters, input_shape: Tuple) -> ModelCallable:
     """
     Return a model that cast the input of the given shape to the given data type. Especially useful for
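For reference, the metric the OV graph above implements (squared difference, mean over the original reduction axes, then max) can be mirrored in plain NumPy. This is a hypothetical sanity-check helper, not part of the changeset.

```python
import numpy as np

def reference_quantization_error(weight, decompressed_weight, reduction_axes):
    # Mirrors the OV graph: squared_difference -> reduce_mean over the original
    # reduction axes -> reduce_max over the remaining axes.
    diff = (np.asarray(decompressed_weight, dtype=np.float32) - np.asarray(weight, dtype=np.float32)) ** 2
    layer_err = np.mean(diff, axis=reduction_axes)
    return float(np.max(layer_err))
```
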

nncf/quantization/algorithms/weight_compression/mixed_precision.py

+8 -3

@@ -139,9 +139,11 @@ def available_backends(self) -> List[BackendType]:
     def _set_backend_entity(self, model: TModel) -> None:
         model_backend = get_backend(model)
         if model_backend == BackendType.OPENVINO:
-            from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend
+            from nncf.quantization.algorithms.weight_compression.openvino_backend import (
+                OVTensorWeightCompressionAlgoBackend,
+            )
 
-            self._backend_entity = OVWeightCompressionAlgoBackend(model)
+            self._backend_entity = OVTensorWeightCompressionAlgoBackend(model)
         elif model_backend == BackendType.TORCH:
             from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend
 

@@ -161,7 +163,10 @@ def _calc_weight_sensitivity(
         graph: NNCFGraph,
     ) -> float:
         weight = self._backend_entity.get_weight(
-            weight_param.node_with_weight, weight_param.weight_port_id, model, graph
+            weight_param.node_with_weight,
+            weight_param.weight_port_id,
+            model,
+            graph,
         )
         backup_config = WeightCompressionConfig()
         reduction_axes = weight_param.reduction_axes

nncf/quantization/algorithms/weight_compression/openvino_backend.py

+17 -5

@@ -33,7 +33,8 @@
 from nncf.openvino.graph.model_transformer import OVModelTransformer
 from nncf.openvino.graph.node_utils import convert_op
 from nncf.openvino.graph.node_utils import create_ov_const_from_tensor
-from nncf.openvino.graph.node_utils import get_const_value
+from nncf.openvino.graph.node_utils import get_const_value_as_numpy_tensor
+from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor
 from nncf.openvino.graph.node_utils import get_weight_channel_axes
 from nncf.openvino.graph.transformations.command_creation import OVCommandCreator
 from nncf.openvino.graph.transformations.commands import OVTargetPoint

@@ -131,7 +132,7 @@ def get_weight_names_and_port_ids(node: NNCFNode, graph: NNCFGraph) -> List[Tupl
     def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph) -> Tensor:
         weight_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["name"]
         weight_node = self.name_to_node_mapping[weight_name]
-        weight_tensor = get_const_value(weight_node)
+        weight_tensor = get_const_value_as_numpy_tensor(weight_node)
         return Tensor(weight_tensor)
 
     def get_weight_dtype(

@@ -298,12 +299,10 @@ def transform_model(
             const_node = self.name_to_node_mapping[const_node_name]
             const_node_output = const_node.output(0)
             const_dtype = const_node_output.get_element_type()
-            weight = get_const_value(const_node, cast_bf16_to_fp32=False)
             # Creation of ov.Tensor is required for two reasons:
             # 1. To be able to process BF16 weight properly
             # 2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed
-            weight = ov.Tensor(weight, weight.shape, const_dtype)
-            weight = Tensor(weight)
+            weight = Tensor(get_const_value_as_ov_tensor(const_node))
 
             should_add_convert_node = False
             if const_dtype != ov.Type.f16:

@@ -365,6 +364,19 @@ def filter_func(point: StatisticPoint) -> bool:
         return filter_func
 
 
+class OVTensorWeightCompressionAlgoBackend(OVWeightCompressionAlgoBackend):
+    """
+    OpenVINO backend for weight compression algorithms that fetches model weights as openvino.Tensor instances.
+    This allows to natively process BF16/FP16 weights.
+    """
+
+    def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph) -> Tensor:
+        weight_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["name"]
+        weight_node = self.name_to_node_mapping[weight_name]
+        weight_tensor = get_const_value_as_ov_tensor(weight_node)
+        return Tensor(weight_tensor)
+
+
 class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend):
     @staticmethod
     def get_awq_patterns():

0 commit comments
