From 30ee587bb578ab110c52e1321b05a52fb707a8c4 Mon Sep 17 00:00:00 2001
From: XueSongTap
Date: Sun, 16 Mar 2025 16:45:38 +0800
Subject: [PATCH] [ONNX]: Add support for data-free Weight Compression
 Algorithm (#3273)

---
 nncf/onnx/graph/layer_attributes.py           |  57 ++
 nncf/onnx/graph/layout.py                     | 137 +++++
 nncf/onnx/graph/metatypes/groups.py           |   6 +
 nncf/onnx/graph/node_utils.py                 |  46 +-
 nncf/onnx/quantization/quantize_model.py      |  71 +++
 .../weight_compression/algorithm.py           |   4 +
 .../weight_compression/onnx_backend.py        | 539 ++++++++++++++++++
 nncf/quantization/quantize_model.py           |   6 +
 nncf/tensor/functions/onnx_numeric.py         |  34 ++
 9 files changed, 899 insertions(+), 1 deletion(-)
 create mode 100644 nncf/onnx/graph/layer_attributes.py
 create mode 100644 nncf/onnx/graph/layout.py
 create mode 100644 nncf/quantization/algorithms/weight_compression/onnx_backend.py
 create mode 100644 nncf/tensor/functions/onnx_numeric.py

diff --git a/nncf/onnx/graph/layer_attributes.py b/nncf/onnx/graph/layer_attributes.py
new file mode 100644
index 00000000000..6e60f6458c1
--- /dev/null
+++ b/nncf/onnx/graph/layer_attributes.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Optional
+
+from nncf.common.graph.layer_attributes import BaseLayerAttributes
+
+
+class ONNXLayerAttributes(BaseLayerAttributes):
+    """
+    This class stores additional information about nodes that needs to be processed during compression.
+    """
+
+    def __init__(
+        self,
+        constant_attributes: Dict[int, Any],
+        layer_attributes: Optional[BaseLayerAttributes] = None,
+        inputs_attributes: Optional[Dict[Any, Any]] = None,
+    ):
+        """
+        :param constant_attributes: Map of weights port ID to corresponding const attributes.
+        :param layer_attributes: Common layer attributes of the node, if any.
+        :param inputs_attributes: Activation attributes.
+        """
+        self._constant_attributes = constant_attributes
+        self._layer_attributes = layer_attributes
+        self._inputs_attributes = inputs_attributes
+
+    @property
+    def constant_attributes(self) -> Dict[int, Any]:
+        return self._constant_attributes
+
+    @property
+    def layer_attributes(self) -> Optional[BaseLayerAttributes]:
+        return self._layer_attributes
+
+    @property
+    def input_attributes(self) -> Optional[Dict[Any, Any]]:
+        return self._inputs_attributes
+
+    def get_const_port_ids(self) -> List[int]:
+        """
+        Returns indices of input ports corresponding to the constant nodes.
+
+        :returns: List of input port indices with constants.
+        """
+        if self._constant_attributes is not None:
+            return list(self._constant_attributes.keys())
+        return []
diff --git a/nncf/onnx/graph/layout.py b/nncf/onnx/graph/layout.py
new file mode 100644
index 00000000000..40acbb90444
--- /dev/null
+++ b/nncf/onnx/graph/layout.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from typing import Tuple
+
+from nncf.common.graph.graph import NNCFNode
+from nncf.onnx.graph.layer_attributes import ONNXLayerAttributes
+from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXConvolutionMetatype
+from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXDepthwiseConvolutionMetatype
+from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXGroupConvolutionMetatype
+from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXOpMetatype
+
+
+class ONNXLayoutElem(Enum):
+    """
+    Layout elements descriptor for convolutional and linear ONNX layers:
+    C_IN: Input channels dimension.
+    C_OUT: Output channels dimension.
+    SPATIAL: Spatial dimension.
+    GROUPS: Groups dimension.
+    """
+
+    C_IN = "channels_in"
+    C_OUT = "channels_out"
+    SPATIAL = "spatial"
+    GROUPS = "groups"
+
+
+_CONV_BASE_CONST_LAYOUT = {
+    ONNXConvolutionMetatype: (ONNXLayoutElem.C_OUT, ONNXLayoutElem.C_IN),
+    ONNXDepthwiseConvolutionMetatype: (ONNXLayoutElem.GROUPS, ONNXLayoutElem.C_OUT, ONNXLayoutElem.C_IN),
+    ONNXGroupConvolutionMetatype: (ONNXLayoutElem.GROUPS, ONNXLayoutElem.C_OUT, ONNXLayoutElem.C_IN),
+}
+
+
+def get_conv_weights_layout_from_node(node: NNCFNode) -> Tuple[ONNXLayoutElem, ...]:
+    """
+    Calculates weights layout for a target convolution node.
+
+    :param node: Target convolution node.
+    :return: Target convolution node weights layout.
+    """
+    layer_attributes = node.layer_attributes
+    port_id = _get_constant_port_id_from_layer_attributes(layer_attributes)
+    return get_conv_weights_layout(
+        onnx_metatype=node.metatype, weights_shape=layer_attributes.constant_attributes[port_id]["shape"]
+    )
+
+
+def get_linear_weights_layout_from_node(node: NNCFNode) -> Tuple[ONNXLayoutElem, ...]:
+    """
+    Calculates weights layout for a target linear node.
+
+    :param node: Target linear node.
+    :return: Target linear node weights layout.
+    """
+    layer_attributes = node.layer_attributes
+    port_id = _get_constant_port_id_from_layer_attributes(layer_attributes)
+    constant_layer_attrs = layer_attributes.constant_attributes[port_id]
+    return get_linear_input_layout(
+        input_shape=constant_layer_attrs["shape"],
+        transpose=constant_layer_attrs["transpose"],
+        port_id=port_id,
+    )
+
+
+def get_linear_activations_layout_from_node(
+    node: NNCFNode, port_id: int, input_shape: Tuple[int, ...]
+) -> Tuple[ONNXLayoutElem, ...]:
+    """
+    Calculates activations layout for a target linear node.
+
+    :param node: Target linear node.
+    :param port_id: Target input port ID.
+    :param input_shape: Shape of the input.
+    :return: Target linear node activations layout.
+    """
+    act_layer_attrs = node.layer_attributes.input_attributes
+    return get_linear_input_layout(
+        input_shape=input_shape,
+        transpose=act_layer_attrs["transpose"],
+        port_id=port_id,
+    )
+ """ + base_layout = _CONV_BASE_CONST_LAYOUT[ONNX_metatype] + kernel_size = weights_shape[len(base_layout) :] + weights_layout = list(base_layout) + [ONNXLayoutElem.SPATIAL] * len(kernel_size) + return tuple(weights_layout) + + +def get_linear_input_layout(input_shape: Tuple[int, ...], transpose: bool, port_id: int) -> Tuple[ONNXLayoutElem]: + """ + Calculates input layout for a target linear node. + + :param input_shape: Shape of the target linear node input. + :param port_id: Port id of the target linear node input. + :return: Target linear node input layout. + """ + input_layout = [ONNXLayoutElem.SPATIAL] * (len(input_shape) - 2) + if len(input_shape) > 1: + if (transpose and port_id == 0) or (not transpose and port_id == 1): + input_layout += [ONNXLayoutElem.C_IN, ONNXLayoutElem.C_OUT] + else: + input_layout += [ONNXLayoutElem.C_OUT, ONNXLayoutElem.C_IN] + else: + input_layout += [ONNXLayoutElem.C_IN] + return tuple(input_layout) + + +def _get_constant_port_id_from_layer_attributes(layer_attributes: ONNXLayerAttributes) -> int: + """ + Returns constant ports id for convolutional and linear ops layer attributes. + + :param layer_attributes: Target convolutional/linear layer op layer attributes. + :return: Constant port id for the target convolutional/linear model. + """ + port_ids = list(layer_attributes.constant_attributes.keys()) + assert len(port_ids) == 1 + return port_ids[0] diff --git a/nncf/onnx/graph/metatypes/groups.py b/nncf/onnx/graph/metatypes/groups.py index a721c6ae175..89caaddd177 100644 --- a/nncf/onnx/graph/metatypes/groups.py +++ b/nncf/onnx/graph/metatypes/groups.py @@ -164,3 +164,9 @@ onnx_metatypes.ONNXROIAlignMetatype, onnx_metatypes.ONNXEmbeddingMetatype, ] + +CONV_OPERATIONS = [ + onnx_metatypes.ONNXConvolutionMetatype, + onnx_metatypes.ONNXDepthwiseConvolutionMetatype, + onnx_metatypes.ONNXGroupConvolutionMetatype, +] diff --git a/nncf/onnx/graph/node_utils.py b/nncf/onnx/graph/node_utils.py index bb21e10c603..ab79616b940 100644 --- a/nncf/onnx/graph/node_utils.py +++ b/nncf/onnx/graph/node_utils.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
diff --git a/nncf/onnx/graph/metatypes/groups.py b/nncf/onnx/graph/metatypes/groups.py
index a721c6ae175..89caaddd177 100644
--- a/nncf/onnx/graph/metatypes/groups.py
+++ b/nncf/onnx/graph/metatypes/groups.py
@@ -164,3 +164,9 @@
     onnx_metatypes.ONNXROIAlignMetatype,
     onnx_metatypes.ONNXEmbeddingMetatype,
 ]
+
+CONV_OPERATIONS = [
+    onnx_metatypes.ONNXConvolutionMetatype,
+    onnx_metatypes.ONNXDepthwiseConvolutionMetatype,
+    onnx_metatypes.ONNXGroupConvolutionMetatype,
+]
diff --git a/nncf/onnx/graph/node_utils.py b/nncf/onnx/graph/node_utils.py
index bb21e10c603..ab79616b940 100644
--- a/nncf/onnx/graph/node_utils.py
+++ b/nncf/onnx/graph/node_utils.py
@@ -9,7 +9,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 import onnx
+from onnx import numpy_helper
@@ -18,10 +18,20 @@
 from nncf.common.graph.graph import NNCFNode
 from nncf.common.graph.transformations.commands import TargetType
 from nncf.common.logging.logger import nncf_logger
+from nncf.onnx.graph.layout import ONNXLayoutElem
+from nncf.onnx.graph.layout import get_conv_weights_layout_from_node
+from nncf.onnx.graph.layout import get_linear_weights_layout_from_node
 from nncf.onnx.graph.metatypes import onnx_metatypes as om
+from nncf.onnx.graph.metatypes.groups import CONV_OPERATIONS
+from nncf.onnx.graph.metatypes.groups import OPERATIONS_WITH_WEIGHTS
 from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXDequantizeLinearMetatype
+from nncf.onnx.graph.metatypes.onnx_metatypes import ONNXMatMulMetatype
 from nncf.onnx.graph.onnx_helper import get_tensor_value
 from nncf.onnx.graph.transformations.commands import ONNXTargetPoint
 
 
 def is_node_with_bias(node: NNCFNode) -> bool:
@@ -139,6 +149,36 @@ def get_weight_quantization_axis(node: NNCFNode, port_id: int) -> int:
     weight_channel_axis = -1 - port_id if transpose else -2 + port_id
     return weight_channel_axis
 
+
+def get_weight_channel_axes(node: NNCFNode) -> List[int]:
+    """
+    Returns axes numbers of the weight tensor which correspond to its channels.
+
+    :param node: NNCFNode with weights.
+    :return: Axes numbers of the weight tensor which correspond to its channels.
+    """
+    if node.metatype not in OPERATIONS_WITH_WEIGHTS:
+        msg = "Channel axis cannot be defined for operation without weights."
+        raise ValueError(msg)
+
+    if node.metatype in CONV_OPERATIONS:
+        weights_layout = get_conv_weights_layout_from_node(node)
+        return [
+            idx for idx, elem in enumerate(weights_layout) if elem in [ONNXLayoutElem.GROUPS, ONNXLayoutElem.C_OUT]
+        ]
+    if node.metatype == ONNXMatMulMetatype:
+        return get_matmul_channel_axes(node)
+    return node.metatype.const_channel_axis
+
+
+def get_matmul_channel_axes(node: NNCFNode) -> List[int]:
+    """
+    Calculates channel axes for the MatMul operation.
+
+    :param node: The target MatMul node.
+    :return: List of channel axes for the MatMul operation.
+    """
+    weights_layout = get_linear_weights_layout_from_node(node)
+    return [
+        idx for idx, elem in enumerate(weights_layout) if elem in [ONNXLayoutElem.SPATIAL, ONNXLayoutElem.C_OUT]
+    ]
+
 
 def get_act_quantization_axis(node: NNCFNode, port_id: int) -> int:
     """
@@ -214,3 +254,7 @@ def get_quantized_tensor_shape(
     if target_point.is_weight_target_point():
         return node.layer_attributes.weight_attrs[target_point.port_id]["shape"]
     return _get_activation_tensor_shape(nncf_graph, node, target_point)
+
+
+def get_const_value_as_onnx_tensor(initializer_name: str, model: onnx.ModelProto) -> np.ndarray:
+    """
+    Returns the value of the initializer with the requested name as a NumPy array.
+
+    :param initializer_name: Name of the initializer.
+    :param model: Model that contains the initializer.
+    :return: Value of the initializer as a NumPy array.
+    """
+    for initializer in model.graph.initializer:
+        if initializer.name == initializer_name:
+            return numpy_helper.to_array(initializer)
+    msg = f"Initializer with name {initializer_name} was not found in the model."
+    raise RuntimeError(msg)
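For reference, the channel axes returned by get_weight_channel_axes feed into nncf.common.graph.utils.get_reduction_axes. A small sketch (illustrative shapes, not part of the patch) for a 2D MatMul weight on port 1, whose output channels sit on axis 1:

    from nncf.common.graph.utils import get_reduction_axes

    channel_axes = [1]  # what get_matmul_channel_axes() yields for this case
    weight_shape = (768, 3072)
    # Per-channel statistics are therefore reduced over the remaining axis 0.
    assert get_reduction_axes(channel_axes, weight_shape) == (0,)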
diff --git a/nncf/onnx/quantization/quantize_model.py b/nncf/onnx/quantization/quantize_model.py
index 6fe92206444..cc1816f4932 100644
--- a/nncf/onnx/quantization/quantize_model.py
+++ b/nncf/onnx/quantization/quantize_model.py
@@ -9,29 +9,40 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from pathlib import Path
 from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union
 
 import onnx
 
 import nncf
+from nncf.common.factory import NNCFGraphFactory
+from nncf.common.factory import StatisticsAggregatorFactory
 from nncf.common.logging.logger import nncf_logger
 from nncf.common.quantization.structs import QuantizationPreset
 from nncf.data import Dataset
 from nncf.onnx.graph.metatypes.groups import OPERATIONS_OUTPUT_HAS_NO_BATCH_AXIS
 from nncf.onnx.graph.nncf_graph_builder import GraphConverter
+from nncf.parameters import BackupMode
+from nncf.parameters import CompressionFormat
+from nncf.parameters import CompressWeightsMode
 from nncf.parameters import DropType
 from nncf.parameters import ModelType
 from nncf.parameters import QuantizationMode
+from nncf.parameters import SensitivityMetric
 from nncf.parameters import TargetDevice
 from nncf.quantization.advanced_parameters import AdvancedAccuracyRestorerParameters
+from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
 from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters
 from nncf.quantization.advanced_parameters import QuantizationParameters
 from nncf.quantization.algorithms.accuracy_control.algorithm import QuantizationAccuracyRestorer
 from nncf.quantization.algorithms.accuracy_control.algorithm import calculate_accuracy_drop
 from nncf.quantization.algorithms.accuracy_control.evaluator import Evaluator
 from nncf.quantization.algorithms.post_training.algorithm import PostTrainingQuantization
+from nncf.quantization.algorithms.weight_compression.algorithm import WeightCompression
 from nncf.quantization.quantize_model import quantize_with_tune_hyperparams
 from nncf.quantization.quantize_model import warning_model_no_batchwise_support
+from nncf.quantization.statistics_caching import cache_weight_compression_statistics
+from nncf.quantization.statistics_caching import register_statistics_for_algorithm
 from nncf.scopes import IgnoredScope
 
 TTensor = TypeVar("TTensor")
@@ -201,3 +212,63 @@ def quantize_with_accuracy_control_impl(
     )
 
     return quantized_model
+
+
+def compress_weights_impl(
+    model: onnx.ModelProto,
+    dataset: Dataset,
+    mode: CompressWeightsMode,
+    ratio: float,
+    group_size: int,
+    ignored_scope: IgnoredScope,
+    all_layers: bool,
+    sensitivity_metric: SensitivityMetric,
+    awq: bool,
+    subset_size: int,
+    scale_estimation: bool,
+    gptq: bool,
+    lora_correction: bool,
+    backup_mode: BackupMode,
+    compression_format: CompressionFormat,
+    advanced_parameters: Optional[AdvancedCompressionParameters] = None,
+) -> onnx.ModelProto:
+    """
+    Implementation of the `compress_weights()` method for the ONNX backend.
+    """
+ """ + graph = NNCFGraphFactory.create(model) + compression_algorithm = WeightCompression( + mode, + ratio, + group_size, + ignored_scope, + all_layers, + sensitivity_metric, + awq, + subset_size, + scale_estimation, + gptq, + lora_correction, + backup_mode, + compression_format, + advanced_parameters, + ) + + statistics_points = None + if advanced_parameters and advanced_parameters.statistics_path: + # If there is no such directory, then caches statistics + statistics_path = Path(advanced_parameters.statistics_path) + if not statistics_path.exists(): + cache_weight_compression_statistics(model, graph, dataset, subset_size, statistics_path) + statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset) + compression_algorithm.set_backend_entity(model) + _, matmul_input_to_output_nodes_map = compression_algorithm.get_compression_nodes_info(graph) + register_statistics_for_algorithm( + statistics_aggregator, + model, + graph, + compression_algorithm, + matmul_input_to_output_nodes_map, + ) + statistics_aggregator.load_statistics_from_dir(statistics_path) + statistics_points = statistics_aggregator.statistic_points + + return compression_algorithm.apply(model, graph, statistics_points, dataset) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 6169f4bada3..ae945f2eb1a 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -323,6 +323,10 @@ def set_backend_entity(self, model: TModel) -> None: from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXWeightCompressionAlgoBackend self._backend_entity = FXWeightCompressionAlgoBackend() + elif model_backend == BackendType.ONNX: + from nncf.quantization.algorithms.weight_compression.onnx_backend import ONNXWeightCompressionAlgoBackend + + self._backend_entity = ONNXWeightCompressionAlgoBackend() else: msg = f"Cannot return backend-specific entity because {model_backend.value} is not supported!" raise nncf.UnsupportedBackendError(msg) diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py new file mode 100644 index 00000000000..a25c22e0916 --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -0,0 +1,539 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py
new file mode 100644
index 00000000000..a25c22e0916
--- /dev/null
+++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py
@@ -0,0 +1,539 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Dict, Iterable, List, Optional, Tuple
+
+import numpy as np
+import onnx
+from onnx import numpy_helper
+
+import nncf
+from nncf.common.graph import NNCFGraph
+from nncf.common.graph import NNCFNode
+from nncf.common.graph.operator_metatypes import OperatorMetatype
+from nncf.common.graph.transformations.commands import TargetType
+from nncf.common.graph.utils import get_reduction_axes
+from nncf.common.tensor_statistics.statistic_point import StatisticPoint
+from nncf.experimental.common.tensor_statistics.collectors import NoopAggregator
+from nncf.experimental.common.tensor_statistics.collectors import TensorCollector
+from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic
+from nncf.onnx.graph.metatypes import onnx_metatypes as om
+from nncf.onnx.graph.node_utils import get_const_value_as_onnx_tensor
+from nncf.onnx.graph.node_utils import get_weight_channel_axes
+from nncf.onnx.graph.transformations.commands import ONNXTargetPoint
+from nncf.onnx.statistics.collectors import ONNXMeanReducer
+from nncf.onnx.statistics.collectors import ONNXShapeReducer
+from nncf.parameters import CompressionFormat
+from nncf.parameters import CompressWeightsMode
+from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
+from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
+from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
+from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight
+from nncf.tensor import Tensor
+from nncf.tensor.definitions import TensorDataType
+from nncf.tensor.functions.onnx_numeric import DTYPE_MAP_REV
+from nncf.tensor.functions.onnx_numeric import ONNX_DTYPE_MAP
+
+
+class ONNXWeightCompressionAlgoBackend(WeightCompressionAlgoBackend):
+    """
+    ONNX backend for the data-free Weight Compression algorithm. Only the INT8_SYM and
+    INT8_ASYM modes are supported for now.
+    """
+
+    @property
+    def matmul_metatypes(self) -> List[OperatorMetatype]:
+        return [om.ONNXMatMulMetatype]
+
+    @property
+    def convolution_metatypes(self) -> List[OperatorMetatype]:
+        # TODO: Add more convolution metatypes
+        return [
+            om.ONNXConvolutionMetatype,
+            om.ONNXDepthwiseConvolutionMetatype,
+            om.ONNXGroupConvolutionMetatype,
+        ]
+
+    @property
+    def embedding_metatypes(self) -> List[OperatorMetatype]:
+        return [om.ONNXEmbeddingMetatype]
+
+    @staticmethod
+    def is_node_with_weights(node: NNCFNode, graph: NNCFGraph) -> bool:
+        return node.layer_attributes is not None and bool(node.layer_attributes.constant_attributes)
+
+    @staticmethod
+    def get_reduction_axes(node_with_weight: NNCFNode, weight_port_id: int, graph: NNCFGraph) -> Optional[Tuple[int]]:
+        channel_axes = get_weight_channel_axes(node_with_weight)
+        const_shape = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["shape"]
+        return get_reduction_axes(channel_axes, const_shape)
+
+    @staticmethod
+    def target_point(target_type: TargetType, target_node_name: str, port_id: int) -> ONNXTargetPoint:
+        return ONNXTargetPoint(target_type, target_node_name, port_id)
+
+    def mean_statistic_collector(
+        self, reduction_axes: Tuple[int], subset_size: Optional[int] = None
+    ) -> TensorCollector:
+        mean_reducer = ONNXMeanReducer(reduction_axes)
+        shape_reducer = ONNXShapeReducer()
+        collector = TensorCollector(WCTensorStatistic)
+        collector.register_statistic_branch(WCTensorStatistic.MEAN_STAT, mean_reducer, NoopAggregator(subset_size))
+        collector.register_statistic_branch(WCTensorStatistic.SHAPE_STAT, shape_reducer, NoopAggregator(subset_size))
+        return collector
+
+    @staticmethod
+    def get_activation_port_id(node: NNCFNode, nncf_graph: NNCFGraph) -> int:
+        if node.layer_attributes.input_attributes["transpose"]:
+            msg = "Transposed input is not supported"
+            raise nncf.UnsupportedModelError(msg)
+        constant_ports = node.layer_attributes.get_const_port_ids()
+        activation_ports = [
+            e.input_port_id for e in nncf_graph.get_input_edges(node) if e.input_port_id not in constant_ports
+        ]
+        assert len(activation_ports) == 1
+        return activation_ports[0]
+
+    @staticmethod
+    def get_weight_names_and_port_ids(node: NNCFNode, graph: NNCFGraph) -> List[Tuple[str, int]]:
+        result = []
+        for weight_port_id in node.layer_attributes.get_const_port_ids():
+            weight_name = node.layer_attributes.constant_attributes[weight_port_id]["name"]
+            result.append((weight_name, weight_port_id))
+        return result
+
+    def get_weight(
+        self, node_with_weight: NNCFNode, weight_port_id: int, model: onnx.ModelProto, graph: NNCFGraph
+    ) -> Tensor:
+        weight_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["name"]
+        weight_tensor = get_const_value_as_onnx_tensor(weight_name, model)
+        return Tensor(weight_tensor)
+
+    def get_weight_dtype(
+        self, node_with_weight: NNCFNode, weight_port_id: int, model: onnx.ModelProto, graph: NNCFGraph
+    ) -> TensorDataType:
+        onnx_type_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["dtype"]
+        onnx_type = getattr(onnx.TensorProto.DataType, onnx_type_name)
+        return DTYPE_MAP_REV[onnx_type]
+
+    @staticmethod
+    def get_weight_shape(node_with_weight: NNCFNode, weight_port_id: int, graph: NNCFGraph) -> Tuple:
+        return node_with_weight.layer_attributes.constant_attributes[weight_port_id]["shape"]
+
+    def _create_compression_subgraph(
+        self,
+        weight: Tensor,
+        compression_config: WeightCompressionConfig,
+        reduction_axes: Tuple[int, ...],
+        const_node_name: str,
+        weight_port_id: int,
+        new_nodes: List[onnx.NodeProto],
+        new_initializers: List[onnx.TensorProto],
+        const_dtype: Optional[TensorDataType] = None,
+        layer_scales: Optional[Tensor] = None,
+        layer_zero_points: Optional[Tensor] = None,
+    ) -> str:
+        """
+        Compresses the given weight and builds the Cast -> (Sub) -> Mul dequantization
+        subgraph for it. Created nodes and initializers are appended to `new_nodes`
+        and `new_initializers`.
+
+        :return: Name of the tensor that holds the dequantized weight.
+        """
+        if compression_config.mode == CompressWeightsMode.INT8_SYM:
+            quantized_dtype = np.int8
+            is_symmetric = True
+        elif compression_config.mode == CompressWeightsMode.INT8_ASYM:
+            quantized_dtype = np.uint8
+            is_symmetric = False
+        else:
+            msg = f"{compression_config.mode.value} is not supported by the ONNX backend yet."
+            raise nncf.ParameterNotSupportedError(msg)
+
+        original_shape = weight.shape
+        compressed_weight = compress_weight(
+            weight,
+            reduction_axes,
+            compression_config,
+            layer_scales,
+            layer_zero_points,
+        )
+
+        prefix = f"{const_node_name}_{weight_port_id}"
+
+        # Store the quantized weight as an int8/uint8 initializer.
+        compressed_tensor_name = f"{prefix}_compressed"
+        compressed_data = compressed_weight.tensor.data.astype(quantized_dtype)
+        new_initializers.append(numpy_helper.from_array(compressed_data, compressed_tensor_name))
+
+        # Store the scale as a float32 initializer.
+        scale_tensor_name = f"{prefix}_scale"
+        scale_data = compressed_weight.scale.data.astype(np.float32)
+        new_initializers.append(numpy_helper.from_array(scale_data, scale_tensor_name))
+
+        # Step 1: cast the quantized integer weight back to float.
+        cast_output_name = f"{prefix}_casted"
+        new_nodes.append(
+            onnx.helper.make_node(
+                "Cast",
+                inputs=[compressed_tensor_name],
+                outputs=[cast_output_name],
+                name=f"{prefix}_cast",
+                to=onnx.TensorProto.FLOAT,
+            )
+        )
+        current_output_name = cast_output_name
+
+        # Step 2 (asymmetric mode only): subtract the zero point. The zero point is
+        # cast to float first so that both Sub inputs have matching types.
+        if not is_symmetric and compressed_weight.zero_point is not None:
+            zero_point_tensor_name = f"{prefix}_zero_point"
+            zero_point_data = compressed_weight.zero_point.data.astype(quantized_dtype)
+            new_initializers.append(numpy_helper.from_array(zero_point_data, zero_point_tensor_name))
+
+            zp_cast_output_name = f"{prefix}_zp_casted"
+            new_nodes.append(
+                onnx.helper.make_node(
+                    "Cast",
+                    inputs=[zero_point_tensor_name],
+                    outputs=[zp_cast_output_name],
+                    name=f"{prefix}_zp_cast",
+                    to=onnx.TensorProto.FLOAT,
+                )
+            )
+
+            sub_output_name = f"{prefix}_sub_zp"
+            new_nodes.append(
+                onnx.helper.make_node(
+                    "Sub",
+                    inputs=[current_output_name, zp_cast_output_name],
+                    outputs=[sub_output_name],
+                    name=f"{prefix}_sub",
+                )
+            )
+            current_output_name = sub_output_name
+
+        # Step 3: multiply by the scale to obtain the dequantized weight.
+        mul_output_name = f"{prefix}_dequantized"
+        new_nodes.append(
+            onnx.helper.make_node(
+                "Mul",
+                inputs=[current_output_name, scale_tensor_name],
+                outputs=[mul_output_name],
+                name=f"{prefix}_mul",
+            )
+        )
+        current_output_name = mul_output_name
+
+        # Step 4: group-wise compression reshapes the weight, so restore the original shape.
+        if compression_config.group_size != -1:
+            shape_tensor_name = f"{prefix}_shape"
+            shape_data = np.array(original_shape, dtype=np.int64)
+            new_initializers.append(numpy_helper.from_array(shape_data, shape_tensor_name))
+
+            reshape_output_name = f"{prefix}_reshaped"
+            new_nodes.append(
+                onnx.helper.make_node(
+                    "Reshape",
+                    inputs=[current_output_name, shape_tensor_name],
+                    outputs=[reshape_output_name],
+                    name=f"{prefix}_reshape",
+                )
+            )
+            current_output_name = reshape_output_name
+
+        # Step 5: if the original constant is not float32, cast the result back to its dtype.
+        if const_dtype is not None and const_dtype != TensorDataType.float32:
+            final_output_name = f"{prefix}_final"
+            new_nodes.append(
+                onnx.helper.make_node(
+                    "Cast",
+                    inputs=[current_output_name],
+                    outputs=[final_output_name],
+                    name=f"{prefix}_final_cast",
+                    to=ONNX_DTYPE_MAP[const_dtype],
+                )
+            )
+            current_output_name = final_output_name
+
+        return current_output_name
+
+    def transform_model(
+        self,
+        model: onnx.ModelProto,
+        graph: NNCFGraph,
+        weight_compression_parameters: Iterable[WeightCompressionParameters],
+        precomputed_scales: Dict[str, Tensor] = None,
+        precomputed_zero_points: Dict[str, Tensor] = None,
+        lora_correction_algo: LoraCorrectionAlgorithm = None,
+        compression_format: CompressionFormat = CompressionFormat.DQ,
+        advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(),
+    ) -> onnx.ModelProto:
+        compressed_model = onnx.ModelProto()
+        compressed_model.CopyFrom(model)
+        onnx_graph = compressed_model.graph
+
+        # Create a name-to-initializer mapping for easier lookup.
+        initializers = {initializer.name: initializer for initializer in onnx_graph.initializer}
+
+        # Track newly added nodes and initializers.
+        new_nodes: List[onnx.NodeProto] = []
+        new_initializers: List[onnx.TensorProto] = []
+        replaced_weight_names = set()
+
+        for wc_params in weight_compression_parameters:
+            node_with_weight = wc_params.node_with_weight
+            weight_port_id = wc_params.weight_port_id
+            weight_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["name"]
+
+            if weight_name not in initializers:
+                msg = f"Could not find weight tensor '{weight_name}' in ONNX model initializers."
+                raise nncf.InternalError(msg)
+
+            weight = Tensor(numpy_helper.to_array(initializers[weight_name]))
+
+            # Find the nodes that consume this weight; skip the weight if there are none.
+            consumer_nodes = [node for node in onnx_graph.node if weight_name in node.input]
+            if not consumer_nodes:
+                continue
+
+            dequantized_name = self._create_compression_subgraph(
+                weight=weight,
+                compression_config=wc_params.compression_config,
+                reduction_axes=wc_params.reduction_axes,
+                const_node_name=weight_name,
+                weight_port_id=weight_port_id,
+                new_nodes=new_nodes,
+                new_initializers=new_initializers,
+                const_dtype=self.get_weight_dtype(node_with_weight, weight_port_id, compressed_model, graph),
+                layer_scales=None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name),
+                layer_zero_points=None
+                if precomputed_zero_points is None
+                else precomputed_zero_points.get(wc_params.weight_name),
+            )
+
+            # Redirect consumers from the original weight to the dequantized tensor.
+            for node in consumer_nodes:
+                for i, input_name in enumerate(node.input):
+                    if input_name == weight_name:
+                        node.input[i] = dequantized_name
+            replaced_weight_names.add(weight_name)
+
+        onnx_graph.node.extend(new_nodes)
+        onnx_graph.initializer.extend(new_initializers)
+
+        # Remove the replaced weight initializers that are no longer referenced by any
+        # node. This is optional but helps reduce the model size.
+        referenced_inputs = {input_name for node in onnx_graph.node for input_name in node.input}
+        for initializer in list(onnx_graph.initializer):
+            if initializer.name in replaced_weight_names and initializer.name not in referenced_inputs:
+                onnx_graph.initializer.remove(initializer)
+
+        # Verify that the transformed model is still valid.
+        try:
+            onnx.checker.check_model(compressed_model)
+        except onnx.checker.ValidationError as e:
+            msg = f"Generated ONNX model is invalid: {e}"
+            raise nncf.InternalError(msg) from e
+
+        return compressed_model
+
+    @staticmethod
+    def dump_parameters(
+        model: onnx.ModelProto, parameters: Dict, algo_name: Optional[str] = "quantization", path: Optional[List] = None
+    ) -> None:
+        # ONNX models have no analogue of the OpenVINO rt_info section, so the
+        # compression parameters are stored in the model metadata instead.
+        key_parts = ["nncf", algo_name] + ([str(part) for part in path] if path else [])
+        model.metadata_props.append(onnx.StringStringEntryProto(key="_".join(key_parts), value=str(parameters)))
+
+    @staticmethod
+    def get_filter_fn_for_statistics(activation_port_id: int, algorithm_key: str) -> Callable[[StatisticPoint], bool]:
+        def filter_func(point: StatisticPoint) -> bool:
+            return (
+                algorithm_key in point.algorithm_to_tensor_collectors
+                and point.target_point.type == TargetType.POST_LAYER_OPERATION
+                and point.target_point.port_id == activation_port_id
+            )
+
+        return filter_func
diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py
index 01d770262c0..5a4af2b9bd8 100644
--- a/nncf/quantization/quantize_model.py
+++ b/nncf/quantization/quantize_model.py
@@ -619,6 +619,12 @@ def compress_weights(
             raise nncf.ParameterNotSupportedError(msg)
 
         compression_weights_impl = ov_compress_weights_impl
+
+    if backend == BackendType.ONNX:
+        from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl
+
+        compression_weights_impl = onnx_compress_weights_impl
+
     check_user_compression_configuration(
         mode,
         subset_size,
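The decompression subgraph emitted by transform_model computes w ≈ (q - zero_point) * scale with Cast, Sub, and Mul nodes. A self-contained NumPy sketch of that arithmetic for the asymmetric case, with illustrative values only:

    import numpy as np

    w = np.random.rand(4, 8).astype(np.float32)
    scale = (w.max(axis=1, keepdims=True) - w.min(axis=1, keepdims=True)) / 255.0
    zp = np.round(-w.min(axis=1, keepdims=True) / scale)
    q = np.clip(np.round(w / scale) + zp, 0, 255)  # stored as uint8 in the model

    # What the Cast -> Sub -> Mul subgraph computes at runtime:
    w_hat = (q - zp) * scale
    assert np.max(np.abs(w - w_hat)) <= scale.max()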
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Dict, Tuple, Union + +import onnx +from numpy.typing import NDArray + +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType +from nncf.tensor.definitions import TensorBackend +from nncf.tensor.definitions import TensorDeviceType +from nncf.tensor.definitions import TypeInfo +from nncf.tensor.functions import numeric + +ONNX_DTYPE_MAP: Dict[TensorDataType, int] = { + TensorDataType.float16: onnx.TensorProto.DataType.FLOAT16, + TensorDataType.bfloat16: onnx.TensorProto.DataType.BFLOAT16, + TensorDataType.float32: onnx.TensorProto.DataType.FLOAT, + TensorDataType.float64: onnx.TensorProto.DataType.DOUBLE, + TensorDataType.int8: onnx.TensorProto.DataType.INT8, + TensorDataType.int32: onnx.TensorProto.DataType.INT32, + TensorDataType.int64: onnx.TensorProto.DataType.INT64, + TensorDataType.uint8: onnx.TensorProto.DataType.UINT8, +} + +DTYPE_MAP_REV = {v: k for k, v in ONNX_DTYPE_MAP.items()} \ No newline at end of file