[Intel GPU] XPUInductorQuantizer for XPU int8 recipe customization (pytorch#139578)

ZhiweiYan-96 · pytorchmergebot · commit c418a9ac7501 · 2024-11-26T09:44:14.000Z
# Motivation This PR adds `XPUInductorQuantizer`, which defines the recipe of int8 quantization for the XPU backend. # Detailed The `XPUInductorQuantizer` is a class derived from `X86InductorQuantizer`, as both quantizers take advantage of the highly optimized operators in the oneDNN library (qconv, qlinear, qconv/qlinear fusion). We share the same recipe as `X86InductorQuantizer`, so we have the same `annotate_xxxx` methods. So, in the ideal situation, the `XPUInductorQuantizer` would have no class body, as all implementation can be inherited from the base class. In this PR, we override the `annotate_xxx` methods for operators that have NOT been implemented. All operators that the XPU backend does not implement fall back to the fp32 implementation, as the node in the graph is wrapped in a `dq-op-q` pair. This helps provide good OOB usability for the XPU backend. On the other hand, the implemented operators use the `annotate_op` implemented in the base class and can be lowered successfully. Pull Request resolved: pytorch#139578 Approved by: https://github.com/EikanWang, https://github.com/leslie-fang-intel, https://github.com/CuiYifeng, https://github.com/jerryzh168 ghstack dependencies: pytorch#133080
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -2457,6 +2457,8 @@
     "SharedQuantizationSpec",
     # torch.ao.quantization.quantizer.x86_inductor_quantizer
     "X86InductorQuantizer",
+    # torch.ao.quantization.quantizer.xpu_inductor_quantizer
+    "XPUInductorQuantizer",
     # torch.ao.quantization.quantizer.xnnpack_quantizer
     "XNNPACKQuantizer",
     # torch.ao.quantization.quantizer.xnnpack_quantizer_utils
diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst
@@ -1353,6 +1353,7 @@ Please take a look at `Limitations of Symbolic Tracing <https://pytorch.org/docs
 .. py:module:: torch.ao.quantization.quantizer.quantizer
 .. py:module:: torch.ao.quantization.quantizer.utils
 .. py:module:: torch.ao.quantization.quantizer.x86_inductor_quantizer
+.. py:module:: torch.ao.quantization.quantizer.xpu_inductor_quantizer
 .. py:module:: torch.ao.quantization.quantizer.xnnpack_quantizer
 .. py:module:: torch.ao.quantization.quantizer.xnnpack_quantizer_utils
 .. py:module:: torch.ao.quantization.stubs
diff --git a/torch/_inductor/fx_passes/quantization.py b/torch/_inductor/fx_passes/quantization.py
@@ -63,7 +63,7 @@ def _get_pattern_output_dtype(match: Match):
     output_node = pattern_output_nodes[0]
     assert isinstance(output_node, torch.fx.Node)
     output_dtype = output_node.meta["val"].dtype
-    assert output_dtype in [torch.uint8, torch.float32, torch.bfloat16]
+    assert output_dtype in [torch.int8, torch.uint8, torch.float32, torch.bfloat16]
     return output_dtype
 
 
@@ -335,10 +335,18 @@ def qconv(match: Match, *args, **kwargs):
             kwargs["groups"],
         )
         output_dtype = _get_pattern_output_dtype(match)
-        assert output_dtype in [torch.uint8, torch.float32, torch.bfloat16]
+        assert output_dtype in [torch.int8, torch.uint8, torch.float32, torch.bfloat16]
         # Output QParams
-        o_inv_scale = kwargs["o_inv_scale"] if output_dtype == torch.uint8 else 1.0
-        o_zero_point = kwargs["o_zp"] if output_dtype == torch.uint8 else 0
+        o_inv_scale = (
+            kwargs["o_inv_scale"]
+            if (output_dtype == torch.uint8 or output_dtype == torch.int8)
+            else 1.0
+        )
+        o_zero_point = (
+            kwargs["o_zp"]
+            if (output_dtype == torch.uint8 or output_dtype == torch.int8)
+            else 0
+        )
         assert (
             kwargs["attr"] == "none"
         )  # Expected no post op fused in weight prepack phase
diff --git a/torch/ao/quantization/quantizer/xpu_inductor_quantizer.py b/torch/ao/quantization/quantizer/xpu_inductor_quantizer.py
@@ -0,0 +1,152 @@
+# mypy: allow-untyped-defs
+import functools
+from typing import Any, Dict, Optional, TYPE_CHECKING
+
+import torch
+from torch.ao.quantization.observer import HistogramObserver, PerChannelMinMaxObserver
+from torch.ao.quantization.quantizer.quantizer import QuantizationSpec
+from torch.ao.quantization.quantizer.x86_inductor_quantizer import (
+    _is_any_annotated,
+    FilterFn,
+    int8_in_int8_out_ops,
+    X86InductorQuantizer,
+)
+from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import QuantizationConfig
+from torch.fx import Node
+
+
+if TYPE_CHECKING:
+    from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor
+
+__all__ = [
+    "XPUInductorQuantizer",
+    "get_default_xpu_inductor_quantization_config",
+]
+
+
+@functools.lru_cache
+def get_default_xpu_inductor_quantization_config():
+    extra_args: Dict[str, Any] = {"eps": 2**-12}
+    act_observer_or_fake_quant_ctr = HistogramObserver
+    act_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=-128,
+        quant_max=127,
+        qscheme=torch.per_tensor_affine,
+        is_dynamic=False,
+        observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args(
+            **extra_args
+        ),
+    )
+
+    weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = (
+        PerChannelMinMaxObserver
+    )
+
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=-128,
+        quant_max=127,
+        qscheme=torch.per_channel_symmetric,
+        ch_axis=0,  # 0 corresponding to weight shape = (oc, ic, kh, kw) of conv
+        is_dynamic=False,
+        observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args(
+            **extra_args
+        ),
+    )
+
+    bias_quantization_spec = None  # will use placeholder observer by default
+    quantization_config = QuantizationConfig(
+        act_quantization_spec,
+        act_quantization_spec,
+        weight_quantization_spec,
+        bias_quantization_spec,
+        False,
+    )
+    return quantization_config
+
+
+class XPUInductorQuantizer(X86InductorQuantizer):
+    """
+    XPUInductorQuantizer is a class designed to facilitate
+    quantization capability at Intel GPU backend. The class
+    highly reuses the existing implementation of
+    X86InductorQuantizer as both are intended to take advantage
+    of the optimized kernels in oneDNN library.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    """
+        The following annotate_xx methods override the impls in the base
+        class, as there is currently no XPU implementation for these
+        operators. We will gradually enable the XPU implementations and
+        remove the following overrides. We keep the annotate methods but make
+        the function bodies empty, so that `_generate_qdq_quantized_model`
+        generates qdq around the op and the graph executes in fp32 dtype for
+        unsupported operators.
+    """
+
+    def _annotate_qat_conv2d_fusion_pattern(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: Optional[QuantizationConfig],
+        filter_fn: Optional[FilterFn] = None,
+    ):
+        pass
+
+    def _annotate_conv2d_binary(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: Optional[QuantizationConfig],
+        filter_fn: Optional[FilterFn] = None,
+    ) -> None:
+        pass
+
+    def _annotate_conv2d_binary_unary(
+        self,
+        gm: torch.fx.GraphModule,
+        quantization_config: Optional[QuantizationConfig],
+        filter_fn: Optional[FilterFn] = None,
+    ) -> None:
+        pass
+
+    def _annotate_linear_fusion_pattern(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: Optional[QuantizationConfig],
+        filter_fn: Optional[FilterFn] = None,
+    ):
+        pass
+
+    def _annotate_matmul(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: Optional[QuantizationConfig],
+        filter_fn: Optional[FilterFn] = None,
+    ):
+        pass
+
+    def _annotate_maxpool2d(
+        self,
+        node: Node,
+        quantization_config: Optional[QuantizationConfig],
+    ) -> None:
+        """
+        Here we skip the annotate logic for maxpool at XPU backend
+        as the quantized::max_pool2d is only implemented for CPU.
+        """
+        return
+
+    def _annotate_output_for_int8_in_int8_out_pattern(
+        self,
+        node: Node,
+    ) -> None:
+        if (node.target in int8_in_int8_out_ops) and (_is_any_annotated([node])):
+            if node.target == torch.ops.aten.max_pool2d.default:
+                return
+            else:
+                input_node = node.all_input_nodes[0]
+                self._annotate_output_share_observer_as_input(input_node, node)
+        return
diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py
@@ -75,7 +75,9 @@
 from typing import Callable, Tuple, Dict, Any, Union, Type, Optional
 import torch._dynamo as torchdynamo
 import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
+import torch.ao.quantization.quantizer.xpu_inductor_quantizer as xpuiq
 from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
+from torch.ao.quantization.quantizer.xpu_inductor_quantizer import XPUInductorQuantizer
 import contextlib
 
 class NodeSpec:
@@ -2940,13 +2942,20 @@ def _generate_qdq_quantized_model(
     mod, inputs, is_qat=False, is_dynamic=False, quantizer=None
 ):
 
-    def get_default_quantizer(is_qat, is_dynamic):
-        quantizer = X86InductorQuantizer()
-        quantizer.set_global(
-            xiq.get_default_x86_inductor_quantization_config(
-                is_qat=is_qat, is_dynamic=is_dynamic
+    def get_default_quantizer(is_qat, is_dynamic, inputs):
+        has_xpu = any(isinstance(input, torch.Tensor) and input.device.type == "xpu"
+                      for input in inputs)
+        if has_xpu:
+            quantizer = XPUInductorQuantizer()
+            assert (not is_qat) and (not is_dynamic), "QAT and dynamic quantization is not supported at XPU backend currently"
+            quantizer.set_global(xpuiq.get_default_xpu_inductor_quantization_config())
+        else:
+            quantizer = X86InductorQuantizer()
+            quantizer.set_global(
+                xiq.get_default_x86_inductor_quantization_config(
+                    is_qat=is_qat, is_dynamic=is_dynamic
+                )
             )
-        )
         return quantizer
 
     maybe_no_grad = contextlib.nullcontext() if is_qat else torch.no_grad()
@@ -2956,7 +2965,7 @@ def get_default_quantizer(is_qat, is_dynamic):
             inputs,
         ).module()
         quantizer = (
-            quantizer if quantizer else get_default_quantizer(is_qat, is_dynamic)
+            quantizer if quantizer else get_default_quantizer(is_qat, is_dynamic, inputs)
         )
         prepare_model = (
             prepare_qat_pt2e(export_model, quantizer)