From 30ee587bb578ab110c52e1321b05a52fb707a8c4 Mon Sep 17 00:00:00 2001
From: XueSongTap
Date: Sun, 16 Mar 2025 16:45:38 +0800
Subject: [PATCH] [ONNX]: Add support for data-free Weight Compression
 Algorithm (#3273)

---
 nncf/onnx/graph/layer_attributes.py           |  57 ++
 nncf/onnx/graph/layout.py                     | 137 +++++
 nncf/onnx/graph/metatypes/groups.py           |   6 +
 nncf/onnx/graph/node_utils.py                 |  46 +-
 nncf/onnx/quantization/quantize_model.py      |  71 +++
 .../weight_compression/algorithm.py           |   4 +
 .../weight_compression/onnx_backend.py        | 539 ++++++++++++++++++
 nncf/quantization/quantize_model.py           |   6 +
 nncf/tensor/functions/onnx_numeric.py         |  34 ++
 9 files changed, 899 insertions(+), 1 deletion(-)
 create mode 100644 nncf/onnx/graph/layer_attributes.py
 create mode 100644 nncf/onnx/graph/layout.py
 create mode 100644 nncf/quantization/algorithms/weight_compression/onnx_backend.py
 create mode 100644 nncf/tensor/functions/onnx_numeric.py

diff --git a/nncf/onnx/graph/layer_attributes.py b/nncf/onnx/graph/layer_attributes.py
new file mode 100644
index 00000000000..6e60f6458c1
--- /dev/null
+++ b/nncf/onnx/graph/layer_attributes.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Optional
+
+from nncf.common.graph.layer_attributes import BaseLayerAttributes
+
+
+class ONNXLayerAttributes(BaseLayerAttributes):
+    """
+    This class stores additional information about nodes that needs to be processed during compression.
+    """
+
+    def __init__(
+        self,
+        constant_attributes: Dict[int, Any],
+        layer_attributes: Optional[BaseLayerAttributes] = None,
+        inputs_attributes: Optional[Dict[Any, Any]] = None,
+    ):
+        """
+        :param constant_attributes: Map of weights port ID to corresponding const attributes.
+        :param layer_attributes: Common layer attributes of the node, if any.
+        :param inputs_attributes: Activation attributes.
+        """
+        self._constant_attributes = constant_attributes
+        self._layer_attributes = layer_attributes
+        self._inputs_attributes = inputs_attributes
+
+    @property
+    def constant_attributes(self) -> Dict[int, Any]:
+        return self._constant_attributes
+
+    @property
+    def layer_attributes(self) -> Optional[BaseLayerAttributes]:
+        return self._layer_attributes
+
+    @property
+    def input_attributes(self) -> Optional[Dict[Any, Any]]:
+        return self._inputs_attributes
+
+    def get_const_port_ids(self) -> List[int]:
+        """
+        Returns indices of input ports corresponding to the constant nodes.
+
+        :returns: List of input port indices with constants.
+        """
+        if self._constant_attributes is not None:
+            return list(self._constant_attributes.keys())
+        return []
diff --git a/nncf/onnx/graph/layout.py b/nncf/onnx/graph/layout.py
new file mode 100644
index 00000000000..40acbb90444
--- /dev/null
+++ b/nncf/onnx/graph/layout.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from typing import Tuple
+
+from nncf.common.graph.graph import NNCFNode
+from nncf.onnx.graph.layer_attributes import ONNXLayerAttributes
+from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXConvolutionMetatype
+from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXDepthwiseConvolutionMetatype
+from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXGroupConvolutionMetatype
+from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXOpMetatype
+
+
+class ONNXLayoutElem(Enum):
+    """
+    Layout elements descriptor for convolutional and linear ONNX layers:
+    C_IN: Input channels dimension.
+    C_OUT: Output channels dimension.
+    SPATIAL: Spatial dimension.
+    GROUPS: Groups dimension.
+    """
+
+    C_IN = "channels_in"
+    C_OUT = "channels_out"
+    SPATIAL = "spatial"
+    GROUPS = "groups"
+
+
+_CONV_BASE_CONST_LAYOUT = {
+    ONNXConvolutionMetatype: (ONNXLayoutElem.C_OUT, ONNXLayoutElem.C_IN),
+    ONNXDepthwiseConvolutionMetatype: (ONNXLayoutElem.GROUPS, ONNXLayoutElem.C_OUT, ONNXLayoutElem.C_IN),
+    ONNXGroupConvolutionMetatype: (ONNXLayoutElem.GROUPS, ONNXLayoutElem.C_OUT, ONNXLayoutElem.C_IN),
+}
+
+
+def get_conv_weights_layout_from_node(node: NNCFNode) -> Tuple[ONNXLayoutElem, ...]:
+    """
+    Calculates weights layout for a target convolution node.
+
+    :param node: Target convolution node.
+    :return: Target convolution node weights layout.
+    """
+    layer_attributes = node.layer_attributes
+    port_id = _get_constant_port_id_from_layer_attributes(layer_attributes)
+    return get_conv_weights_layout(
+        onnx_metatype=node.metatype, weights_shape=layer_attributes.constant_attributes[port_id]["shape"]
+    )
+
+
+def get_linear_weights_layout_from_node(node: NNCFNode) -> Tuple[ONNXLayoutElem, ...]:
+    """
+    Calculates weights layout for a target linear node.
+
+    :param node: Target linear node.
+    :return: Target linear node weights layout.
+    """
+    layer_attributes = node.layer_attributes
+    port_id = _get_constant_port_id_from_layer_attributes(layer_attributes)
+    constant_layer_attrs = layer_attributes.constant_attributes[port_id]
+    return get_linear_input_layout(
+        input_shape=constant_layer_attrs["shape"],
+        transpose=constant_layer_attrs["transpose"],
+        port_id=port_id,
+    )
+
+
+def get_linear_activations_layout_from_node(
+    node: NNCFNode, port_id: int, input_shape: Tuple[int, ...]
+) -> Tuple[ONNXLayoutElem, ...]:
+    """
+    Calculates activations layout for a target linear node.
+
+    :param node: Target linear node.
+    :param port_id: Target input port ID.
+    :param input_shape: Shape of the input.
+    :return: Target linear node activations layout.
+    """
+    act_layer_attrs = node.layer_attributes.input_attributes
+    return get_linear_input_layout(
+        input_shape=input_shape,
+        transpose=act_layer_attrs["transpose"],
+        port_id=port_id,
+    )
+ """ + base_layout = _CONV_BASE_CONST_LAYOUT[ONNX_metatype] + kernel_size = weights_shape[len(base_layout) :] + weights_layout = list(base_layout) + [ONNXLayoutElem.SPATIAL] * len(kernel_size) + return tuple(weights_layout) + + +def get_linear_input_layout(input_shape: Tuple[int, ...], transpose: bool, port_id: int) -> Tuple[ONNXLayoutElem]: + """ + Calculates input layout for a target linear node. + + :param input_shape: Shape of the target linear node input. + :param port_id: Port id of the target linear node input. + :return: Target linear node input layout. + """ + input_layout = [ONNXLayoutElem.SPATIAL] * (len(input_shape) - 2) + if len(input_shape) > 1: + if (transpose and port_id == 0) or (not transpose and port_id == 1): + input_layout += [ONNXLayoutElem.C_IN, ONNXLayoutElem.C_OUT] + else: + input_layout += [ONNXLayoutElem.C_OUT, ONNXLayoutElem.C_IN] + else: + input_layout += [ONNXLayoutElem.C_IN] + return tuple(input_layout) + + +def _get_constant_port_id_from_layer_attributes(layer_attributes: ONNXLayerAttributes) -> int: + """ + Returns constant ports id for convolutional and linear ops layer attributes. + + :param layer_attributes: Target convolutional/linear layer op layer attributes. + :return: Constant port id for the target convolutional/linear model. + """ + port_ids = list(layer_attributes.constant_attributes.keys()) + assert len(port_ids) == 1 + return port_ids[0] diff --git a/nncf/onnx/graph/metatypes/groups.py b/nncf/onnx/graph/metatypes/groups.py index a721c6ae175..89caaddd177 100644 --- a/nncf/onnx/graph/metatypes/groups.py +++ b/nncf/onnx/graph/metatypes/groups.py @@ -164,3 +164,9 @@ onnx_metatypes.ONNXROIAlignMetatype, onnx_metatypes.ONNXEmbeddingMetatype, ] + +CONV_OPERATIONS = [ + onnx_metatypes.ONNXConvolutionMetatype, + onnx_metatypes.ONNXDepthwiseConvolutionMetatype, + onnx_metatypes.ONNXGroupConvolutionMetatype, +] diff --git a/nncf/onnx/graph/node_utils.py b/nncf/onnx/graph/node_utils.py index bb21e10c603..ab79616b940 100644 --- a/nncf/onnx/graph/node_utils.py +++ b/nncf/onnx/graph/node_utils.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
diff --git a/nncf/onnx/graph/metatypes/groups.py b/nncf/onnx/graph/metatypes/groups.py
index a721c6ae175..89caaddd177 100644
--- a/nncf/onnx/graph/metatypes/groups.py
+++ b/nncf/onnx/graph/metatypes/groups.py
@@ -164,3 +164,9 @@
     onnx_metatypes.ONNXROIAlignMetatype,
     onnx_metatypes.ONNXEmbeddingMetatype,
 ]
+
+CONV_OPERATIONS = [
+    onnx_metatypes.ONNXConvolutionMetatype,
+    onnx_metatypes.ONNXDepthwiseConvolutionMetatype,
+    onnx_metatypes.ONNXGroupConvolutionMetatype,
+]
diff --git a/nncf/onnx/graph/node_utils.py b/nncf/onnx/graph/node_utils.py
index bb21e10c603..ab79616b940 100644
--- a/nncf/onnx/graph/node_utils.py
+++ b/nncf/onnx/graph/node_utils.py
@@ -9,7 +9,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 import onnx
+from onnx import numpy_helper
@@ -18,10 +18,20 @@
 from nncf.common.graph.graph import NNCFNode
 from nncf.common.graph.transformations.commands import TargetType
 from nncf.common.logging.logger import nncf_logger
+from nncf.onnx.graph.layout import ONNXLayoutElem
+from nncf.onnx.graph.layout import get_conv_weights_layout_from_node
+from nncf.onnx.graph.layout import get_linear_weights_layout_from_node
 from nncf.onnx.graph.metatypes import onnx_metatypes as om
+from nncf.onnx.graph.metatypes.groups import CONV_OPERATIONS
+from nncf.onnx.graph.metatypes.groups import OPERATIONS_WITH_WEIGHTS
 from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXDequantizeLinearMetatype
+from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXMatMulMetatype
 from nncf.onnx.graph.onnx_helper import get_tensor_value
 from nncf.onnx.graph.transformations.commands import ONNXTargetPoint
 
 
 def is_node_with_bias(node: NNCFNode) -> bool:
@@ -139,6 +149,36 @@ def get_weight_quantization_axis(node: NNCFNode, port_id: int) -> int:
     weight_channel_axis = -1 - port_id if transpose else -2 + port_id
     return weight_channel_axis
 
+
+def get_weight_channel_axes(node: NNCFNode) -> List[int]:
+    """
+    Returns axes numbers of the weight tensor which correspond to its channels.
+
+    :param node: NNCFNode with weights.
+    :return: Axes numbers of the weight tensor which correspond to its channels.
+    """
+    if node.metatype not in OPERATIONS_WITH_WEIGHTS:
+        msg = "Channel axis cannot be defined for operation without weights."
+        raise ValueError(msg)
+
+    if node.metatype in CONV_OPERATIONS:
+        weights_layout = get_conv_weights_layout_from_node(node)
+        return [
+            idx for idx, elem in enumerate(weights_layout) if elem in [ONNXLayoutElem.GROUPS, ONNXLayoutElem.C_OUT]
+        ]
+    if node.metatype == ONNXMatMulMetatype:
+        return get_matmul_channel_axes(node)
+    return node.metatype.const_channel_axis
+
+
+def get_matmul_channel_axes(node: NNCFNode) -> List[int]:
+    """
+    Calculates channel axes for the MatMul operation.
+
+    :param node: The target MatMul node.
+    :return: List of channel axes for the MatMul operation.
+    """
+    weights_layout = get_linear_weights_layout_from_node(node)
+    return [
+        idx for idx, elem in enumerate(weights_layout) if elem in [ONNXLayoutElem.SPATIAL, ONNXLayoutElem.C_OUT]
+    ]
+
 
 def get_act_quantization_axis(node: NNCFNode, port_id: int) -> int:
     """
@@ -214,3 +254,7 @@ def get_quantized_tensor_shape(
     if target_point.is_weight_target_point():
         return node.layer_attributes.weight_attrs[target_point.port_id]["shape"]
     return _get_activation_tensor_shape(nncf_graph, node, target_point)
+
+
+def get_const_value_as_onnx_tensor(initializer_name: str, model: onnx.ModelProto) -> np.ndarray:
+    """
+    Returns the value of the initializer with the requested name as a NumPy array.
+
+    :param initializer_name: Name of the initializer.
+    :param model: Model that contains the initializer.
+    :return: Value of the initializer as a NumPy array.
+    """
+    for initializer in model.graph.initializer:
+        if initializer.name == initializer_name:
+            return numpy_helper.to_array(initializer)
+    msg = f"Initializer with name {initializer_name} was not found in the model."
+    raise RuntimeError(msg)
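For reference, the channel axes returned by get_weight_channel_axes feed into nncf.common.graph.utils.get_reduction_axes. A small sketch (illustrative shapes, not part of the patch) for a 2D MatMul weight on port 1, whose output channels sit on axis 1:

    from nncf.common.graph.utils import get_reduction_axes

    channel_axes = [1]  # what get_matmul_channel_axes() yields for this case
    weight_shape = (768, 3072)
    # Per-channel statistics are therefore reduced over the remaining axis 0.
    assert get_reduction_axes(channel_axes, weight_shape) == (0,)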
diff --git a/nncf/onnx/quantization/quantize_model.py b/nncf/onnx/quantization/quantize_model.py
index 6fe92206444..cc1816f4932 100644
--- a/nncf/onnx/quantization/quantize_model.py
+++ b/nncf/onnx/quantization/quantize_model.py
@@ -9,29 +9,40 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from pathlib import Path
 from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union
 
 import onnx
 
 import nncf
+from nncf.common.factory import NNCFGraphFactory
+from nncf.common.factory import StatisticsAggregatorFactory
 from nncf.common.logging.logger import nncf_logger
 from nncf.common.quantization.structs import QuantizationPreset
 from nncf.data import Dataset
 from nncf.onnx.graph.metatypes.groups import OPERATIONS_OUTPUT_HAS_NO_BATCH_AXIS
 from nncf.onnx.graph.nncf_graph_builder import GraphConverter
+from nncf.parameters import BackupMode
+from nncf.parameters import CompressionFormat
+from nncf.parameters import CompressWeightsMode
 from nncf.parameters import DropType
 from nncf.parameters import ModelType
 from nncf.parameters import QuantizationMode
+from nncf.parameters import SensitivityMetric
 from nncf.parameters import TargetDevice
 from nncf.quantization.advanced_parameters import AdvancedAccuracyRestorerParameters
+from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
 from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters
 from nncf.quantization.advanced_parameters import QuantizationParameters
 from nncf.quantization.algorithms.accuracy_control.algorithm import QuantizationAccuracyRestorer
 from nncf.quantization.algorithms.accuracy_control.algorithm import calculate_accuracy_drop
 from nncf.quantization.algorithms.accuracy_control.evaluator import Evaluator
 from nncf.quantization.algorithms.post_training.algorithm import PostTrainingQuantization
+from nncf.quantization.algorithms.weight_compression.algorithm import WeightCompression
 from nncf.quantization.quantize_model import quantize_with_tune_hyperparams
 from nncf.quantization.quantize_model import warning_model_no_batchwise_support
+from nncf.quantization.statistics_caching import cache_weight_compression_statistics
+from nncf.quantization.statistics_caching import register_statistics_for_algorithm
 from nncf.scopes import IgnoredScope
 
 TTensor = TypeVar("TTensor")
@@ -201,3 +212,63 @@ def quantize_with_accuracy_control_impl(
     )
 
     return quantized_model
+
+
+def compress_weights_impl(
+    model: onnx.ModelProto,
+    dataset: Dataset,
+    mode: CompressWeightsMode,
+    ratio: float,
+    group_size: int,
+    ignored_scope: IgnoredScope,
+    all_layers: bool,
+    sensitivity_metric: SensitivityMetric,
+    awq: bool,
+    subset_size: int,
+    scale_estimation: bool,
+    gptq: bool,
+    lora_correction: bool,
+    backup_mode: BackupMode,
+    compression_format: CompressionFormat,
+    advanced_parameters: Optional[AdvancedCompressionParameters] = None,
+) -> onnx.ModelProto:
+    """
+    Implementation of the `compress_weights()` method for the ONNX backend.
+    """
+ """ + graph = NNCFGraphFactory.create(model) + compression_algorithm = WeightCompression( + mode, + ratio, + group_size, + ignored_scope, + all_layers, + sensitivity_metric, + awq, + subset_size, + scale_estimation, + gptq, + lora_correction, + backup_mode, + compression_format, + advanced_parameters, + ) + + statistics_points = None + if advanced_parameters and advanced_parameters.statistics_path: + # If there is no such directory, then caches statistics + statistics_path = Path(advanced_parameters.statistics_path) + if not statistics_path.exists(): + cache_weight_compression_statistics(model, graph, dataset, subset_size, statistics_path) + statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset) + compression_algorithm.set_backend_entity(model) + _, matmul_input_to_output_nodes_map = compression_algorithm.get_compression_nodes_info(graph) + register_statistics_for_algorithm( + statistics_aggregator, + model, + graph, + compression_algorithm, + matmul_input_to_output_nodes_map, + ) + statistics_aggregator.load_statistics_from_dir(statistics_path) + statistics_points = statistics_aggregator.statistic_points + + return compression_algorithm.apply(model, graph, statistics_points, dataset) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 6169f4bada3..ae945f2eb1a 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -323,6 +323,10 @@ def set_backend_entity(self, model: TModel) -> None: from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXWeightCompressionAlgoBackend self._backend_entity = FXWeightCompressionAlgoBackend() + elif model_backend == BackendType.ONNX: + from nncf.quantization.algorithms.weight_compression.onnx_backend import ONNXWeightCompressionAlgoBackend + + self._backend_entity = ONNXWeightCompressionAlgoBackend() else: msg = f"Cannot return backend-specific entity because {model_backend.value} is not supported!" raise nncf.UnsupportedBackendError(msg) diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py new file mode 100644 index 00000000000..a25c22e0916 --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -0,0 +1,539 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py
new file mode 100644
index 00000000000..a25c22e0916
--- /dev/null
+++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py
@@ -0,0 +1,539 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Dict, Iterable, List, Optional, Tuple
+
+import numpy as np
+import onnx
+from onnx import numpy_helper
+
+import nncf
+from nncf.common.graph import NNCFGraph
+from nncf.common.graph import NNCFNode
+from nncf.common.graph.operator_metatypes import OperatorMetatype
+from nncf.common.graph.transformations.commands import TargetType
+from nncf.common.graph.utils import get_reduction_axes
+from nncf.common.tensor_statistics.statistic_point import StatisticPoint
+from nncf.experimental.common.tensor_statistics.collectors import NoopAggregator
+from nncf.experimental.common.tensor_statistics.collectors import TensorCollector
+from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic
+from nncf.onnx.graph.metatypes import onnx_metatypes as om
+from nncf.onnx.graph.node_utils import get_const_value_as_onnx_tensor
+from nncf.onnx.graph.node_utils import get_weight_channel_axes
+from nncf.onnx.graph.transformations.commands import ONNXTargetPoint
+from nncf.onnx.statistics.collectors import ONNXMeanReducer
+from nncf.onnx.statistics.collectors import ONNXShapeReducer
+from nncf.parameters import CompressionFormat
+from nncf.parameters import CompressWeightsMode
+from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
+from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
+from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
+from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight
+from nncf.tensor import Tensor
+from nncf.tensor.definitions import TensorDataType
+from nncf.tensor.functions.onnx_numeric import DTYPE_MAP_REV
+from nncf.tensor.functions.onnx_numeric import ONNX_DTYPE_MAP
+
+
+class ONNXWeightCompressionAlgoBackend(WeightCompressionAlgoBackend):
+    """
+    ONNX backend for the data-free Weight Compression algorithm. Only the INT8_SYM and
+    INT8_ASYM modes are supported for now.
+    """
+
+    @property
+    def matmul_metatypes(self) -> List[OperatorMetatype]:
+        return [om.ONNXMatMulMetatype]
+
+    @property
+    def convolution_metatypes(self) -> List[OperatorMetatype]:
+        # TODO: Add more convolution metatypes
+        return [
+            om.ONNXConvolutionMetatype,
+            om.ONNXDepthwiseConvolutionMetatype,
+            om.ONNXGroupConvolutionMetatype,
+        ]
+
+    @property
+    def embedding_metatypes(self) -> List[OperatorMetatype]:
+        return [om.ONNXEmbeddingMetatype]
+
+    @staticmethod
+    def is_node_with_weights(node: NNCFNode, graph: NNCFGraph) -> bool:
+        return node.layer_attributes is not None and bool(node.layer_attributes.constant_attributes)
+
+    @staticmethod
+    def get_reduction_axes(node_with_weight: NNCFNode, weight_port_id: int, graph: NNCFGraph) -> Optional[Tuple[int]]:
+        channel_axes = get_weight_channel_axes(node_with_weight)
+        const_shape = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["shape"]
+        return get_reduction_axes(channel_axes, const_shape)
+
+    @staticmethod
+    def target_point(target_type: TargetType, target_node_name: str, port_id: int) -> ONNXTargetPoint:
+        return ONNXTargetPoint(target_type, target_node_name, port_id)
+
+    def mean_statistic_collector(
+        self, reduction_axes: Tuple[int], subset_size: Optional[int] = None
+    ) -> TensorCollector:
+        mean_reducer = ONNXMeanReducer(reduction_axes)
+        shape_reducer = ONNXShapeReducer()
+        collector = TensorCollector(WCTensorStatistic)
+        collector.register_statistic_branch(WCTensorStatistic.MEAN_STAT, mean_reducer, NoopAggregator(subset_size))
+        collector.register_statistic_branch(WCTensorStatistic.SHAPE_STAT, shape_reducer, NoopAggregator(subset_size))
+        return collector
+
+    @staticmethod
+    def get_activation_port_id(node: NNCFNode, nncf_graph: NNCFGraph) -> int:
+        if node.layer_attributes.input_attributes["transpose"]:
+            msg = "Transposed input is not supported"
+            raise nncf.UnsupportedModelError(msg)
+        constant_ports = node.layer_attributes.get_const_port_ids()
+        activation_ports = [
+            e.input_port_id for e in nncf_graph.get_input_edges(node) if e.input_port_id not in constant_ports
+        ]
+        assert len(activation_ports) == 1
+        return activation_ports[0]
+
+    @staticmethod
+    def get_weight_names_and_port_ids(node: NNCFNode, graph: NNCFGraph) -> List[Tuple[str, int]]:
+        result = []
+        for weight_port_id in node.layer_attributes.get_const_port_ids():
+            weight_name = node.layer_attributes.constant_attributes[weight_port_id]["name"]
+            result.append((weight_name, weight_port_id))
+        return result
+
+    def get_weight(
+        self, node_with_weight: NNCFNode, weight_port_id: int, model: onnx.ModelProto, graph: NNCFGraph
+    ) -> Tensor:
+        weight_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["name"]
+        weight_tensor = get_const_value_as_onnx_tensor(weight_name, model)
+        return Tensor(weight_tensor)
+
+    def get_weight_dtype(
+        self, node_with_weight: NNCFNode, weight_port_id: int, model: onnx.ModelProto, graph: NNCFGraph
+    ) -> TensorDataType:
+        onnx_type_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["dtype"]
+        onnx_type = getattr(onnx.TensorProto.DataType, onnx_type_name)
+        return DTYPE_MAP_REV[onnx_type]
+
+    @staticmethod
+    def get_weight_shape(node_with_weight: NNCFNode, weight_port_id: int, graph: NNCFGraph) -> Tuple:
+        return node_with_weight.layer_attributes.constant_attributes[weight_port_id]["shape"]
+
+    def _create_compression_subgraph(
+        self,
+        weight: Tensor,
+        compression_config: WeightCompressionConfig,
+        reduction_axes: Tuple[int, ...],
+        const_node_name: str,
+        weight_port_id: int,
+        new_nodes: List[onnx.NodeProto],
+        new_initializers: List[onnx.TensorProto],
+        const_dtype: Optional[TensorDataType] = None,
+        layer_scales: Optional[Tensor] = None,
+        layer_zero_points: Optional[Tensor] = None,
+    ) -> str:
+        """
+        Compresses the given weight and builds the Cast -> (Sub) -> Mul dequantization
+        subgraph for it. Created nodes and initializers are appended to `new_nodes`
+        and `new_initializers`.
+
+        :return: Name of the tensor that holds the dequantized weight.
+        """
+        if compression_config.mode == CompressWeightsMode.INT8_SYM:
+            quantized_dtype = np.int8
+            is_symmetric = True
+        elif compression_config.mode == CompressWeightsMode.INT8_ASYM:
+            quantized_dtype = np.uint8
+            is_symmetric = False
+        else:
+            msg = f"{compression_config.mode.value} is not supported by the ONNX backend yet."
+            raise nncf.ParameterNotSupportedError(msg)
+
+        original_shape = weight.shape
+        compressed_weight = compress_weight(
+            weight,
+            reduction_axes,
+            compression_config,
+            layer_scales,
+            layer_zero_points,
+        )
+
+        prefix = f"{const_node_name}_{weight_port_id}"
+
+        # Store the quantized weight as an int8/uint8 initializer.
+        compressed_tensor_name = f"{prefix}_compressed"
+        compressed_data = compressed_weight.tensor.data.astype(quantized_dtype)
+        new_initializers.append(numpy_helper.from_array(compressed_data, compressed_tensor_name))
+
+        # Store the scale as a float32 initializer.
+        scale_tensor_name = f"{prefix}_scale"
+        scale_data = compressed_weight.scale.data.astype(np.float32)
+        new_initializers.append(numpy_helper.from_array(scale_data, scale_tensor_name))
+
+        # Step 1: cast the quantized integer weight back to float.
+        cast_output_name = f"{prefix}_casted"
+        new_nodes.append(
+            onnx.helper.make_node(
+                "Cast",
+                inputs=[compressed_tensor_name],
+                outputs=[cast_output_name],
+                name=f"{prefix}_cast",
+                to=onnx.TensorProto.FLOAT,
+            )
+        )
+        current_output_name = cast_output_name
+
+        # Step 2 (asymmetric mode only): subtract the zero point. The zero point is
+        # cast to float first so that both Sub inputs have matching types.
+        if not is_symmetric and compressed_weight.zero_point is not None:
+            zero_point_tensor_name = f"{prefix}_zero_point"
+            zero_point_data = compressed_weight.zero_point.data.astype(quantized_dtype)
+            new_initializers.append(numpy_helper.from_array(zero_point_data, zero_point_tensor_name))
+
+            zp_cast_output_name = f"{prefix}_zp_casted"
+            new_nodes.append(
+                onnx.helper.make_node(
+                    "Cast",
+                    inputs=[zero_point_tensor_name],
+                    outputs=[zp_cast_output_name],
+                    name=f"{prefix}_zp_cast",
+                    to=onnx.TensorProto.FLOAT,
+                )
+            )
+
+            sub_output_name = f"{prefix}_sub_zp"
+            new_nodes.append(
+                onnx.helper.make_node(
+                    "Sub",
+                    inputs=[current_output_name, zp_cast_output_name],
+                    outputs=[sub_output_name],
+                    name=f"{prefix}_sub",
+                )
+            )
+            current_output_name = sub_output_name
+
+        # Step 3: multiply by the scale to obtain the dequantized weight.
+        mul_output_name = f"{prefix}_dequantized"
+        new_nodes.append(
+            onnx.helper.make_node(
+                "Mul",
+                inputs=[current_output_name, scale_tensor_name],
+                outputs=[mul_output_name],
+                name=f"{prefix}_mul",
+            )
+        )
+        current_output_name = mul_output_name
+
+        # Step 4: group-wise compression reshapes the weight, so restore the original shape.
+        if compression_config.group_size != -1:
+            shape_tensor_name = f"{prefix}_shape"
+            shape_data = np.array(original_shape, dtype=np.int64)
+            new_initializers.append(numpy_helper.from_array(shape_data, shape_tensor_name))
+
+            reshape_output_name = f"{prefix}_reshaped"
+            new_nodes.append(
+                onnx.helper.make_node(
+                    "Reshape",
+                    inputs=[current_output_name, shape_tensor_name],
+                    outputs=[reshape_output_name],
+                    name=f"{prefix}_reshape",
+                )
+            )
+            current_output_name = reshape_output_name
+
+        # Step 5: if the original constant is not float32, cast the result back to its dtype.
+        if const_dtype is not None and const_dtype != TensorDataType.float32:
+            final_output_name = f"{prefix}_final"
+            new_nodes.append(
+                onnx.helper.make_node(
+                    "Cast",
+                    inputs=[current_output_name],
+                    outputs=[final_output_name],
+                    name=f"{prefix}_final_cast",
+                    to=ONNX_DTYPE_MAP[const_dtype],
+                )
+            )
+            current_output_name = final_output_name
+
+        return current_output_name
+
+    def transform_model(
+        self,
+        model: onnx.ModelProto,
+        graph: NNCFGraph,
+        weight_compression_parameters: Iterable[WeightCompressionParameters],
+        precomputed_scales: Dict[str, Tensor] = None,
+        precomputed_zero_points: Dict[str, Tensor] = None,
+        lora_correction_algo: LoraCorrectionAlgorithm = None,
+        compression_format: CompressionFormat = CompressionFormat.DQ,
+        advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(),
+    ) -> onnx.ModelProto:
+        compressed_model = onnx.ModelProto()
+        compressed_model.CopyFrom(model)
+        onnx_graph = compressed_model.graph
+
+        # Create a name-to-initializer mapping for easier lookup.
+        initializers = {initializer.name: initializer for initializer in onnx_graph.initializer}
+
+        # Track newly added nodes and initializers.
+        new_nodes: List[onnx.NodeProto] = []
+        new_initializers: List[onnx.TensorProto] = []
+        replaced_weight_names = set()
+
+        for wc_params in weight_compression_parameters:
+            node_with_weight = wc_params.node_with_weight
+            weight_port_id = wc_params.weight_port_id
+            weight_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["name"]
+
+            if weight_name not in initializers:
+                msg = f"Could not find weight tensor '{weight_name}' in ONNX model initializers."
+                raise nncf.InternalError(msg)
+
+            weight = Tensor(numpy_helper.to_array(initializers[weight_name]))
+
+            # Find the nodes that consume this weight; skip the weight if there are none.
+            consumer_nodes = [node for node in onnx_graph.node if weight_name in node.input]
+            if not consumer_nodes:
+                continue
+
+            dequantized_name = self._create_compression_subgraph(
+                weight=weight,
+                compression_config=wc_params.compression_config,
+                reduction_axes=wc_params.reduction_axes,
+                const_node_name=weight_name,
+                weight_port_id=weight_port_id,
+                new_nodes=new_nodes,
+                new_initializers=new_initializers,
+                const_dtype=self.get_weight_dtype(node_with_weight, weight_port_id, compressed_model, graph),
+                layer_scales=None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name),
+                layer_zero_points=None
+                if precomputed_zero_points is None
+                else precomputed_zero_points.get(wc_params.weight_name),
+            )
+
+            # Redirect consumers from the original weight to the dequantized tensor.
+            for node in consumer_nodes:
+                for i, input_name in enumerate(node.input):
+                    if input_name == weight_name:
+                        node.input[i] = dequantized_name
+            replaced_weight_names.add(weight_name)
+
+        onnx_graph.node.extend(new_nodes)
+        onnx_graph.initializer.extend(new_initializers)
+
+        # Remove the replaced weight initializers that are no longer referenced by any
+        # node. This is optional but helps reduce the model size.
+        referenced_inputs = {input_name for node in onnx_graph.node for input_name in node.input}
+        for initializer in list(onnx_graph.initializer):
+            if initializer.name in replaced_weight_names and initializer.name not in referenced_inputs:
+                onnx_graph.initializer.remove(initializer)
+
+        # Verify that the transformed model is still valid.
+        try:
+            onnx.checker.check_model(compressed_model)
+        except onnx.checker.ValidationError as e:
+            msg = f"Generated ONNX model is invalid: {e}"
+            raise nncf.InternalError(msg) from e
+
+        return compressed_model
+
+    @staticmethod
+    def dump_parameters(
+        model: onnx.ModelProto, parameters: Dict, algo_name: Optional[str] = "quantization", path: Optional[List] = None
+    ) -> None:
+        # ONNX models have no analogue of the OpenVINO rt_info section, so the
+        # compression parameters are stored in the model metadata instead.
+        key_parts = ["nncf", algo_name] + ([str(part) for part in path] if path else [])
+        model.metadata_props.append(onnx.StringStringEntryProto(key="_".join(key_parts), value=str(parameters)))
+
+    @staticmethod
+    def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]:
+        def filter_func(point: StatisticPoint) -> bool:
+            return (
+                algorithm_key in point.algorithm_to_tensor_collectors
+                and point.target_point.type == TargetType.POST_LAYER_OPERATION
+                and point.target_point.port_id == activation_port_id
+            )
+
+        return filter_func
diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py
index 01d770262c0..5a4af2b9bd8 100644
--- a/nncf/quantization/quantize_model.py
+++ b/nncf/quantization/quantize_model.py
@@ -619,6 +619,12 @@ def compress_weights(
             raise nncf.ParameterNotSupportedError(msg)
 
         compression_weights_impl = ov_compress_weights_impl
+
+    if backend == BackendType.ONNX:
+        from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl
+
+        compression_weights_impl = onnx_compress_weights_impl
+
     check_user_compression_configuration(
         mode,
         subset_size,
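The decompression subgraph emitted by transform_model computes w ≈ (q - zero_point) * scale with Cast, Sub, and Mul nodes. A self-contained NumPy sketch of that arithmetic for the asymmetric case, with illustrative values only:

    import numpy as np

    w = np.random.rand(4, 8).astype(np.float32)
    scale = (w.max(axis=1, keepdims=True) - w.min(axis=1, keepdims=True)) / 255.0
    zp = np.round(-w.min(axis=1, keepdims=True) / scale)
    q = np.clip(np.round(w / scale) + zp, 0, 255)  # stored as uint8 in the model

    # What the Cast -> Sub -> Mul subgraph computes at runtime:
    w_hat = (q - zp) * scale
    assert np.max(np.abs(w - w_hat)) <= scale.max()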
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Dict, Tuple, Union + +import onnx +from numpy.typing import NDArray + +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType +from nncf.tensor.definitions import TensorBackend +from nncf.tensor.definitions import TensorDeviceType +from nncf.tensor.definitions import TypeInfo +from nncf.tensor.functions import numeric + +ONNX_DTYPE_MAP: Dict[TensorDataType, int] = { + TensorDataType.float16: onnx.TensorProto.DataType.FLOAT16, + TensorDataType.bfloat16: onnx.TensorProto.DataType.BFLOAT16, + TensorDataType.float32: onnx.TensorProto.DataType.FLOAT, + TensorDataType.float64: onnx.TensorProto.DataType.DOUBLE, + TensorDataType.int8: onnx.TensorProto.DataType.INT8, + TensorDataType.int32: onnx.TensorProto.DataType.INT32, + TensorDataType.int64: onnx.TensorProto.DataType.INT64, + TensorDataType.uint8: onnx.TensorProto.DataType.UINT8, +} + +DTYPE_MAP_REV = {v: k for k, v in ONNX_DTYPE_MAP.items()} \ No newline at end of file