
Commit 2179d33
Disable the exllama on all non-cuda devices. (#2003)
* Disable the exllama on all non-cuda devices.
  1. Disable the exllama on all non-cuda devices.
  2. Don't raise the error when running on non-cuda device.
  Signed-off-by: yuanwu <yuan.wu@intel.com>

* Refine the code
  Signed-off-by: yuanwu <yuan.wu@intel.com>

* Fix errors of make style
  Signed-off-by: yuanwu <yuan.wu@intel.com>

* Add hpu device
  Signed-off-by: yuanwu <yuan.wu@intel.com>

* Update optimum/gptq/constants.py
  Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* Update optimum/gptq/quantizer.py
  Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* Update optimum/gptq/quantizer.py
  Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* Update optimum/gptq/quantizer.py
  Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* Fix error of make style
  Signed-off-by: yuanwu <yuan.wu@intel.com>

---------

Signed-off-by: yuanwu <yuan.wu@intel.com>
Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
1 parent ca36fc4 commit 2179d33
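The substantive change is the device test: instead of comparing the device for equality with torch.device("cpu"), the quantizer now checks device.type != "cuda", so any non-CUDA placement (cpu, hpu, or an offload map containing cpu/disk entries) takes the non-exllama path. A minimal sketch of why the old predicate missed HPU; the devices set here is an illustrative stand-in for the module placements the quantizer collects:

import torch

device = torch.device("hpu")   # Gaudi/HPU placement; constructing the device
                               # object does not require the Habana runtime
devices = {"hpu"}              # stand-in for the collected module placements

# Old predicate: only an exact CPU device (or cpu/disk offload) was caught,
# so an HPU-resident model sailed past and later hit the CUDA-only kernels.
old_hit = device == torch.device("cpu") or any(d in devices for d in ["cpu", "disk"])

# New predicate: any device whose type is not "cuda" disables exllama,
# and "hpu" is now matched in the offload map as well.
new_hit = device.type != "cuda" or any(d in devices for d in ["cpu", "disk", "hpu"])

print(old_hit, new_hit)  # False True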

File tree: 1 file changed (+8 −7 lines)

optimum/gptq/quantizer.py (+8 −7)
@@ -546,7 +546,7 @@ def tmp(_, input, output):
 
         if self.bits == 4:
             # device not on gpu
-            if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])):
+            if device.type != "cuda" or (has_device_map and any(d in devices for d in ["cpu", "disk", "hpu"])):
                 if not self.disable_exllama:
                     logger.warning(
                         "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`"

@@ -589,13 +589,14 @@ def post_init_model(self, model):
                 The input model
         """
         if self.bits == 4 and not self.disable_exllama:
-            if get_device(model) == torch.device("cpu") or (
-                hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"])
+            if get_device(model).type != "cuda" or (
+                hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk", "hpu"])
             ):
-                raise ValueError(
-                    "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU."
-                    "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object"
-                )
+                if not self.disable_exllama:
+                    logger.warning(
+                        "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
+                    )
+                self.disable_exllama = True
 
         class StoreAttr(object):
             pass
