[Conformance][TorchFX] GPU quantization support (#3010)

daniil-lyakhov · web-flow · commit cf36f3ff0f0e · 2025-02-03T18:00:01.000+02:00
### Changes

* `--torch-compile-validation` CLI option is added
* CUDA_FX_TORCH backend is added to conformance test
* FXSQMultiply is updated to work on both CPU and GPU

### Tests

Local run:
CLI: `python -m pytest test_quantize_conformance.py -k CUDA_FX --data path/to/imagenet`

| Model | Backend | Metric name | Metric value | Metric diff | Num FQ | Num int4 | Num int8 | Compr. time | Total time | RAM MiB | Status | Build url |
|-----------------------------------|---------------|-------------|--------------|-------------|--------|----------|----------|-------------|------------|---------|--------|-----------|
| torchvision/resnet18 | CUDA_FX_TORCH | Acc@1 | 0.6942 | -0.0036 | 30 | 0 | 21 | 0:00:02 | 0:04:14 | 1560 | | |
| torchvision/swin_v2_s | CUDA_FX_TORCH | Acc@1 | 0.83572 | -0.0014 | 149 | 0 | 101 | 0:00:55 | 0:17:24 | 3161 | | |
| torchvision/vit_b_16 | CUDA_FX_TORCH | Acc@1 | 0.80962 | -0.00108 | 62 | 0 | 50 | 0:00:19 | 0:13:39 | 2876 | | |
| torchvision/mobilenet_v3_small_BC | CUDA_FX_TORCH | Acc@1 | 0.66642 | -0.01018 | 61 | 0 | 36 | 0:00:05 | 0:04:09 | 1653 | | |
diff --git a/nncf/experimental/torch/fx/constant_folding.py b/nncf/experimental/torch/fx/constant_folding.py
@@ -15,6 +15,8 @@
 import torch.fx
 import torch.utils._pytree as pytree
 
+from nncf.torch.utils import get_model_device
+
 aten = torch.ops.aten
 
 
@@ -246,28 +248,29 @@ def constant_fold(
     :param constraint_fn: Constraint function which takes a node and returs the constraint:
         should the node be constant folded or not.
     """
-    with torch.no_grad():
-        with torch.utils._python_dispatch._disable_current_modes():
-            cf = ConstantFolder(gm)
-            cf.run()
+    with torch.no_grad(), torch.utils._python_dispatch._disable_current_modes():
+        cf = ConstantFolder(gm)
+        cf.run()
 
-            for node, constant in cf.node_replacements.items():
-                if constraint_fn is not None and not constraint_fn(node):
-                    continue
-                _replace_node_with_constant(gm, node, constant)
-
-            erased_params = []
-            for node in gm.graph.find_nodes(op="get_attr"):
-                if len(node.users) == 0:
-                    if hasattr(gm, node.target):
-                        delattr(gm, node.target)
-                    erased_params.append(node)
-
-            for node in erased_params:
-                gm.graph.erase_node(node)
-
-            # Custom _is_impure function allows to eliminate all layers with zero
-            # users including inplace ops like relu_ besides output and placeholders.
-            gm.graph.eliminate_dead_code(_is_impure)
-            gm.graph.lint()
-            gm.recompile()
+        device = get_model_device(gm)
+        for node, constant in cf.node_replacements.items():
+            if constraint_fn is not None and not constraint_fn(node):
+                continue
+            constant = constant.to(device)
+            _replace_node_with_constant(gm, node, constant)
+
+        erased_params = []
+        for node in gm.graph.find_nodes(op="get_attr"):
+            if len(node.users) == 0:
+                if hasattr(gm, node.target):
+                    delattr(gm, node.target)
+                erased_params.append(node)
+
+        for node in erased_params:
+            gm.graph.erase_node(node)
+
+        # Custom _is_impure function allows to eliminate all layers with zero
+        # users including inplace ops like relu_ besides output and placeholders.
+        gm.graph.eliminate_dead_code(_is_impure)
+        gm.graph.lint()
+        gm.recompile()
diff --git a/nncf/quantization/algorithms/smooth_quant/torch_fx_backend.py b/nncf/quantization/algorithms/smooth_quant/torch_fx_backend.py
@@ -42,7 +42,8 @@
 class FXSQMultiply(torch.nn.Module):
     def __init__(self, scale: torch.Tensor):
         super().__init__()
-        self._scale_value = scale
+        self.register_buffer("_scale_value", scale)
+        self._scale_value: torch.Tensor
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return torch.mul(x, self._scale_value)
diff --git a/tests/post_training/conftest.py b/tests/post_training/conftest.py
@@ -19,6 +19,11 @@ def pytest_addoption(parser):
     parser.addoption("--fp32", action="store_true", help="Test original model")
     parser.addoption("--cuda", action="store_true", help="Enable CUDA_TORCH backend")
     parser.addoption("--benchmark", action="store_true", help="Run benchmark_app")
+    parser.addoption(
+        "--torch-compile-validation",
+        action="store_true",
+        help='Validate TorchFX quantized models via torch.compile(..., backend="openvino")',
+    )
     parser.addoption(
         "--extra-columns",
         action="store_true",
diff --git a/tests/post_training/data/ptq_reference_data.yaml b/tests/post_training/data/ptq_reference_data.yaml
@@ -38,6 +38,8 @@ torchvision/resnet18_backend_CUDA_TORCH:
   metric_value: 0.69152
 torchvision/resnet18_backend_FX_TORCH:
   metric_value: 0.6946
+torchvision/resnet18_backend_CUDA_FX_TORCH:
+  metric_value: 0.6946
 torchvision/mobilenet_v3_small_BC_backend_FP32:
   metric_value: 0.6766
 torchvision/mobilenet_v3_small_BC_backend_OV:
@@ -46,18 +48,24 @@ torchvision/mobilenet_v3_small_BC_backend_ONNX:
   metric_value: 0.6679
 torchvision/mobilenet_v3_small_BC_backend_FX_TORCH:
   metric_value: 0.6679
+torchvision/mobilenet_v3_small_BC_backend_CUDA_FX_TORCH:
+  metric_value: 0.6664
 torchvision/vit_b_16_backend_FP32:
   metric_value: 0.8107
 torchvision/vit_b_16_backend_OV:
   metric_value: 0.80948
 torchvision/vit_b_16_backend_FX_TORCH:
   metric_value: 0.80922
+torchvision/vit_b_16_backend_CUDA_FX_TORCH:
+  metric_value: 0.80922
 torchvision/swin_v2_s_backend_FP32:
   metric_value: 0.83712
 torchvision/swin_v2_s_backend_OV:
   metric_value: 0.83638
 torchvision/swin_v2_s_backend_FX_TORCH:
   metric_value: 0.8360
+torchvision/swin_v2_s_backend_CUDA_FX_TORCH:
+  metric_value: 0.8360
 timm/crossvit_9_240_backend_CUDA_TORCH:
   metric_value: 0.7275
 timm/crossvit_9_240_backend_FP32:
diff --git a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py
@@ -87,7 +87,14 @@
         "model_id": "resnet18",
         "pipeline_cls": ImageClassificationTorchvision,
         "compression_params": {},
-        "backends": [BackendType.FX_TORCH, BackendType.TORCH, BackendType.CUDA_TORCH, BackendType.OV, BackendType.ONNX],
+        "backends": [
+            BackendType.FX_TORCH,
+            BackendType.CUDA_FX_TORCH,
+            BackendType.TORCH,
+            BackendType.CUDA_TORCH,
+            BackendType.OV,
+            BackendType.ONNX,
+        ],
         "batch_size": 128,
     },
     {
@@ -98,7 +105,7 @@
             "fast_bias_correction": False,
             "preset": QuantizationPreset.MIXED,
         },
-        "backends": [BackendType.FX_TORCH, BackendType.OV, BackendType.ONNX],
+        "backends": [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH, BackendType.OV, BackendType.ONNX],
         "batch_size": 128,
     },
     {
@@ -109,7 +116,7 @@
             "model_type": ModelType.TRANSFORMER,
             "advanced_parameters": AdvancedQuantizationParameters(smooth_quant_alpha=0.15),
         },
-        "backends": [BackendType.FX_TORCH, BackendType.OV],
+        "backends": [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH, BackendType.OV],
         "batch_size": 1,
     },
     {
@@ -120,7 +127,7 @@
             "model_type": ModelType.TRANSFORMER,
             "advanced_parameters": AdvancedQuantizationParameters(smooth_quant_alpha=0.5),
         },
-        "backends": [BackendType.FX_TORCH, BackendType.OV],
+        "backends": [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH, BackendType.OV],
         "batch_size": 1,
     },
     # Timm models
diff --git a/tests/post_training/pipelines/base.py b/tests/post_training/pipelines/base.py
@@ -55,6 +55,7 @@ class BackendType(Enum):
     TORCH = "TORCH"
     CUDA_TORCH = "CUDA_TORCH"
     FX_TORCH = "FX_TORCH"
+    CUDA_FX_TORCH = "CUDA_FX_TORCH"
     ONNX = "ONNX"
     OV = "OV"
     OPTIMUM = "OPTIMUM"
@@ -63,6 +64,7 @@ class BackendType(Enum):
 NNCF_PTQ_BACKENDS = [BackendType.TORCH, BackendType.CUDA_TORCH, BackendType.ONNX, BackendType.OV]
 ALL_PTQ_BACKENDS = NNCF_PTQ_BACKENDS
 PT_BACKENDS = [BackendType.TORCH, BackendType.CUDA_TORCH]
+FX_BACKENDS = [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH]
 OV_BACKENDS = [BackendType.OV, BackendType.OPTIMUM]
 
 LIMIT_LENGTH_OF_STATUS = 120
@@ -222,6 +224,7 @@ def __init__(
         reference_data: dict,
         no_eval: bool,
         run_benchmark_app: bool,
+        torch_compile_validation: bool = False,
         params: dict = None,
         batch_size: int = 1,
         memory_monitor: bool = False,
@@ -238,6 +241,7 @@ def __init__(
         self.memory_monitor = memory_monitor
         self.no_eval = no_eval
         self.run_benchmark_app = run_benchmark_app
+        self.torch_compile_validation = torch_compile_validation
         self.output_model_dir: Path = self.output_dir / self.reported_name / self.backend.value
         self.output_model_dir.mkdir(parents=True, exist_ok=True)
         self.model_name = f"{self.reported_name}_{self.backend.value}"
@@ -436,12 +440,17 @@ def save_compressed_model(self) -> None:
             )
             self.path_compressed_ir = self.output_model_dir / "model.xml"
             ov.serialize(ov_model, self.path_compressed_ir)
-        elif self.backend == BackendType.FX_TORCH:
-            exported_model = torch.export.export(self.compressed_model, (self.dummy_tensor,))
+        elif self.backend in FX_BACKENDS:
+            exported_model = torch.export.export(self.compressed_model.cpu(), (self.dummy_tensor.cpu(),))
             ov_model = ov.convert_model(exported_model, example_input=self.dummy_tensor.cpu(), input=self.input_size)
             ov_model.reshape(self.input_size)
             self.path_compressed_ir = self.output_model_dir / "model.xml"
             ov.serialize(ov_model, self.path_compressed_ir)
+            # Move the model and dummy input back to the GPU only for the CUDA FX backend.
+            if self.backend is BackendType.CUDA_FX_TORCH:
+                self.model = self.model.cuda()
+                self.dummy_tensor = self.dummy_tensor.cuda()
+
         elif self.backend == BackendType.ONNX:
             onnx_path = self.output_model_dir / "model.onnx"
             onnx.save(self.compressed_model, str(onnx_path))
diff --git a/tests/post_training/pipelines/image_classification_base.py b/tests/post_training/pipelines/image_classification_base.py
@@ -22,6 +22,7 @@
 import nncf
 from nncf.common.logging.track_progress import track
 from tests.post_training.pipelines.base import DEFAULT_VAL_THREADS
+from tests.post_training.pipelines.base import FX_BACKENDS
 from tests.post_training.pipelines.base import ErrorReport
 from tests.post_training.pipelines.base import PTQTestPipeline
 
@@ -35,18 +36,15 @@ def prepare_calibration_dataset(self):
 
         self.calibration_dataset = nncf.Dataset(loader, self.get_transform_calibration_fn())
 
-    def _validate(self) -> List[ErrorReport]:
-        val_dataset = datasets.ImageFolder(root=self.data_dir / "imagenet" / "val", transform=self.transform)
-        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, num_workers=2, shuffle=False)
-
-        dataset_size = len(val_loader)
-
-        # Initialize result tensors for async inference support.
-        predictions = np.zeros(dataset_size)
-        references = -1 * np.ones(dataset_size)
+    def _validate_ov(
+        self,
+        val_loader: torch.utils.data.DataLoader,
+        predictions: np.ndarray,
+        references: np.ndarray,
+        dataset_size: int,
+    ):
 
         core = ov.Core()
-
         if os.environ.get("INFERENCE_NUM_THREADS"):
             # Set CPU_THREADS_NUM for OpenVINO inference
             inference_num_threads = os.environ.get("INFERENCE_NUM_THREADS")
@@ -75,6 +73,34 @@ def process_result(request, userdata):
                 references[i] = target
 
             infer_queue.wait_all()
+        return predictions, references
+
+    def _validate_torch_compile(
+        self, val_loader: torch.utils.data.DataLoader, predictions: np.ndarray, references: np.ndarray
+    ):
+        compiled_model = torch.compile(self.compressed_model.cpu(), backend="openvino")
+        for i, (images, target) in enumerate(val_loader):
+            # Feed the batch to the compiled model as-is; the arg-max over dim 1 is stored as the prediction.
+            pred = compiled_model(images)
+            pred = torch.argmax(pred, dim=1)
+            predictions[i] = pred.numpy()
+            references[i] = target.numpy()
+        return predictions, references
+
+    def _validate(self) -> List[ErrorReport]:
+        val_dataset = datasets.ImageFolder(root=self.data_dir / "imagenet" / "val", transform=self.transform)
+        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, num_workers=2, shuffle=False)
+
+        dataset_size = len(val_loader)
+
+        # Initialize result tensors for async inference support.
+        predictions = np.zeros(dataset_size)
+        references = -1 * np.ones(dataset_size)
+
+        if self.backend in FX_BACKENDS and self.torch_compile_validation:
+            predictions, references = self._validate_torch_compile(val_loader, predictions, references)
+        else:
+            predictions, references = self._validate_ov(val_loader, predictions, references, dataset_size)
 
         acc_top1 = accuracy_score(predictions, references)
 
diff --git a/tests/post_training/pipelines/image_classification_torchvision.py b/tests/post_training/pipelines/image_classification_torchvision.py
@@ -19,6 +19,7 @@
 from torchvision import models
 
 from nncf.torch import disable_patching
+from tests.post_training.pipelines.base import FX_BACKENDS
 from tests.post_training.pipelines.base import PT_BACKENDS
 from tests.post_training.pipelines.base import BackendType
 from tests.post_training.pipelines.image_classification_base import ImageClassificationBase
@@ -74,9 +75,12 @@ def prepare_model(self) -> None:
         if self.batch_size > 1:  # Dynamic batch_size shape export
             self.input_size[0] = -1
 
-        if self.backend == BackendType.FX_TORCH:
+        if self.backend in FX_BACKENDS:
             with torch.no_grad():
                 with disable_patching():
+                    if self.backend is BackendType.CUDA_FX_TORCH:
+                        model = model.cuda()
+                        self.dummy_tensor = self.dummy_tensor.cuda()
                     self.model = self.model_params.export_fn(model, (self.dummy_tensor,))
 
         elif self.backend in PT_BACKENDS:
@@ -120,20 +124,26 @@ def _dump_model_fp32(self) -> None:
                 )
             ov.serialize(ov_model, self.fp32_model_dir / "model_fp32.xml")
 
-        if self.backend == BackendType.FX_TORCH:
-            exported_model = torch.export.export(self.model, (self.dummy_tensor,))
+        if self.backend in FX_BACKENDS:
+            exported_model = torch.export.export(self.model.cpu(), (self.dummy_tensor.cpu(),))
             ov_model = ov.convert_model(exported_model, example_input=self.dummy_tensor, input=self.input_size)
             ov.serialize(ov_model, self.fp32_model_dir / "fx_model_fp32.xml")
 
+            if self.backend is BackendType.CUDA_FX_TORCH:
+                self.model = self.model.cuda()
+                self.dummy_tensor = self.dummy_tensor.cuda()
+
         if self.backend in [BackendType.FP32, BackendType.OV]:
             ov.serialize(self.model, self.fp32_model_dir / "model_fp32.xml")
 
     def prepare_preprocessor(self) -> None:
         self.transform = self.model_params.weights.transforms()
 
     def get_transform_calibration_fn(self):
-        if self.backend in [BackendType.FX_TORCH] + PT_BACKENDS:
-            device = torch.device("cuda" if self.backend == BackendType.CUDA_TORCH else "cpu")
+        if self.backend in FX_BACKENDS + PT_BACKENDS:
+            device = torch.device(
+                "cuda" if self.backend in [BackendType.CUDA_TORCH, BackendType.CUDA_FX_TORCH] else "cpu"
+            )
 
             def transform_fn(data_item):
                 images, _ = data_item
diff --git a/tests/post_training/test_quantize_conformance.py b/tests/post_training/test_quantize_conformance.py
@@ -90,6 +90,11 @@ def fixture_run_benchmark_app(pytestconfig):
     return pytestconfig.getoption("benchmark")
 
 
+@pytest.fixture(scope="session", name="torch_compile_validation")
+def fixture_torch_compile_validation(pytestconfig):
+    return pytestconfig.getoption("torch_compile_validation")
+
+
 @pytest.fixture(scope="session", name="extra_columns")
 def fixture_extra_columns(pytestconfig):
     return pytestconfig.getoption("extra_columns")
@@ -281,6 +286,7 @@ def test_ptq_quantization(
     run_torch_cuda_backend: bool,
     subset_size: Optional[int],
     run_benchmark_app: bool,
+    torch_compile_validation: bool,
     capsys: pytest.CaptureFixture,
     extra_columns: bool,
     memory_monitor: bool,
@@ -309,6 +315,7 @@ def test_ptq_quantization(
                 "data_dir": data_dir,
                 "no_eval": no_eval,
                 "run_benchmark_app": run_benchmark_app,
+                "torch_compile_validation": torch_compile_validation,
                 "batch_size": batch_size,
                 "memory_monitor": memory_monitor,
             }
diff --git a/tests/torch/data/reference_graphs/fx/transformed/folded_scalar_clone_model.dot b/tests/torch/data/reference_graphs/fx/transformed/folded_scalar_clone_model.dot
diff --git a/tests/torch/fx/test_model_transformer.py b/tests/torch/fx/test_model_transformer.py
diff --git a/tests/torch/test_models/synthetic.py b/tests/torch/test_models/synthetic.py