Disable optimized compression on ARM CPUs until the next release (#3366)

nikita-savelyevv · web-flow · commit 996b3089b730 · 2025-03-24T14:03:38.000+02:00
### Changes
- Disabled optimized compression on ARM CPUs until the next OpenVINO release (2025.2), because the fix openvinotoolkit/openvino#29577 won't be merged in time.
- Added the `NNCF_DISABLE_OPTIMIZED_COMPRESSION` environment variable to disable optimized compression if needed; setting it to any value disables it.

### Reason for changes
Enable weights compression on ARM CPUs.

### Related tickets
164135
diff --git a/nncf/openvino/cpu_info.py b/nncf/openvino/cpu_info.py
@@ -13,16 +13,34 @@
 
 import openvino as ov
 
+_IS_ARM_CPU = None
 _IS_LNL_CPU = None
 
 
+def _get_cpu_name() -> str:
+    """
+    :return: The name of the CPU.
+    """
+    return ov.Core().get_property("CPU", ov.properties.device.full_name)
+
+
+def is_arm_cpu() -> bool:
+    """
+    Checks whether current CPU is an ARM CPU or not.
+    :return: True if current CPU is an ARM CPU, False otherwise.
+    """
+    global _IS_ARM_CPU
+    if _IS_ARM_CPU is None:
+        _IS_ARM_CPU = "arm" in _get_cpu_name().lower()
+    return _IS_ARM_CPU
+
+
 def is_lnl_cpu() -> bool:
     """
     Checks whether current CPU is an Intel Lunar Lake generation or not.
     :return: True if current CPU is an Intel Lunar Lake generation, False otherwise.
     """
     global _IS_LNL_CPU
     if _IS_LNL_CPU is None:
-        cpu_name = ov.Core().get_property("CPU", ov.properties.device.full_name)
-        _IS_LNL_CPU = re.search(r"Ultra \d 2\d{2}", cpu_name) is not None
+        _IS_LNL_CPU = re.search(r"Ultra \d 2\d{2}", _get_cpu_name()) is not None
     return _IS_LNL_CPU
diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -8,14 +8,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import os
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 
 import numpy as np
 
 import nncf
 from nncf.common.logging.logger import nncf_logger
+from nncf.common.utils.backend import is_openvino_at_least
 from nncf.common.utils.backend import is_openvino_available
 from nncf.parameters import CompressWeightsMode
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
@@ -543,9 +544,15 @@ def quantize_dequantize_weight(
 
 
 def _can_run_optimized(input_backend: TensorBackend) -> bool:
-    if input_backend in [TensorBackend.ov, TensorBackend.numpy]:
+    if (
+        input_backend in [TensorBackend.ov, TensorBackend.numpy]
+        and os.environ.get("NNCF_DISABLE_OPTIMIZED_COMPRESSION") is None
+    ):
         if is_openvino_available():
-            return True
+            from nncf.openvino.cpu_info import is_arm_cpu
+
+            # Due to a bug in CPU plugin compression models can fail at compilation on ARM CPUs. Ticket: 164135.
+            return not is_arm_cpu() or is_openvino_at_least("2025.2")
         else:
             nncf_logger.info_once(
                 "OpenVINO optimizations are disabled. Install OpenVINO to enable them and improve the performance."
diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
@@ -12,6 +12,7 @@
 import inspect
 import os
 from typing import Callable, Dict, List
+from unittest.mock import patch
 
 import numpy as np
 import openvino.runtime as ov
@@ -21,10 +22,12 @@
 from openvino.runtime import opset13 as opset
 
 import nncf
+import nncf.openvino.optimized_functions as opt_fns
 from nncf import CompressWeightsMode
 from nncf import SensitivityMetric
 from nncf.common.factory import NNCFGraphFactory
 from nncf.common.utils.debug import nncf_debug
+from nncf.common.utils.helpers import set_env_variable
 from nncf.data.dataset import Dataset
 from nncf.experimental.common.tensor_statistics.collectors import AggregatorBase
 from nncf.openvino.graph.model_transformer import OVModelTransformer
@@ -1487,6 +1490,25 @@ def test_compression_with_transposed_activations(kwargs):
         )
 
 
+@pytest.mark.parametrize("disabled", [False, True])
+def test_disabled_optimized_compression(disabled):
+    model = LMLinearModel().ov_model
+
+    def run_compression():
+        compress_weights(model, mode=CompressWeightsMode.INT8)
+
+    fn_to_patch = opt_fns.do_int_quantization
+    patch_path = f"nncf.openvino.optimized_functions.{fn_to_patch.__name__}"
+    with patch(patch_path, side_effect=fn_to_patch) as mock:
+        if disabled:
+            with set_env_variable("NNCF_DISABLE_OPTIMIZED_COMPRESSION", "1"):
+                run_compression()
+            mock.assert_not_called()
+        else:
+            run_compression()
+            mock.assert_called_once()
+
+
 class TestOVTemplateWeightCompression(TemplateWeightCompression):
     @staticmethod
     def get_matmul_model() -> ov.Model: