Allow GPTQModel to auto-select Marlin or faster kernels for inference-only ops (#2138)

LRL-ModelCloud · Qubitium · web-flow · commit 53240c3f6804 · 2025-01-08T14:20:41.000+01:00
* select quant_linear with pack

* up GPTQMODEL_MINIMUM_VERSION

* Update quantizer.py

* update gptqmodel version

---------

Co-authored-by: Qubitium-ModelCloud <qubitium@modelcloud.ai>
diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
@@ -220,7 +220,7 @@ def __init__(
                 )
         self.exllama_version = self.exllama_config["version"]
 
-    def select_quant_linear(self, device_map: Union[str, dict]):
+    def select_quant_linear(self, device_map: Union[str, dict], pack: bool = False):
         if is_gptqmodel_available():
             self.quant_linear = hf_select_quant_linear(
                 bits=self.bits,
@@ -231,6 +231,7 @@ def select_quant_linear(self, device_map: Union[str, dict]):
                 meta=self.meta,
                 device_map=device_map,
                 backend=self.backend,
+                pack=pack,
             )
         else:
             self.quant_linear = hf_select_quant_linear(
@@ -301,7 +302,7 @@ def convert_model(self, model: nn.Module, **kwargs):
                     )
                     del layers_to_be_replaced[name]
 
-        self.select_quant_linear(device_map=kwargs.get("device_map", None))
+        self.select_quant_linear(device_map=kwargs.get("device_map", None), pack=False)
 
         self._replace_by_quant_layers(model, layers_to_be_replaced)
 
@@ -761,7 +762,7 @@ def pack_model(
         layers = get_layers(model)
         layers = {n: layers[n] for n in quantizers}
 
-        self.select_quant_linear(device_map=model.hf_device_map)
+        self.select_quant_linear(device_map=model.hf_device_map, pack=True)
 
         self._replace_by_quant_layers(model, quantizers)
         qlayers = get_layers(model, [self.quant_linear])
diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py
@@ -52,7 +52,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
 TRANSFORMERS_MINIMUM_VERSION = version.parse("4.25.0")
 DIFFUSERS_MINIMUM_VERSION = version.parse("0.22.0")
 AUTOGPTQ_MINIMUM_VERSION = version.parse("0.4.99")  # Allows 0.5.0.dev0
-GPTQMODEL_MINIMUM_VERSION = version.parse("1.4.2")
+GPTQMODEL_MINIMUM_VERSION = version.parse("1.6.0")
 
 
 # This is the minimal required version to support some ONNX Runtime features